diff --git a/deeptools/SES_scaleFactor.py b/deeptools/SES_scaleFactor.py index 2232ca62c..bc45fd42c 100644 --- a/deeptools/SES_scaleFactor.py +++ b/deeptools/SES_scaleFactor.py @@ -8,14 +8,22 @@ from deeptools import bamHandler import deeptools.countReadsPerBin as countR -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") debug = 0 -def estimateScaleFactor(bamFilesList, binLength, numberOfSamples, - normalizationLength, - avg_method='median', blackListFileName=None, numberOfProcessors=1, - verbose=False, chrsToSkip=[], mappingStatsList=[]): +def estimateScaleFactor( + bamFilesList, + binLength, + numberOfSamples, + normalizationLength, + avg_method="median", + blackListFileName=None, + numberOfProcessors=1, + verbose=False, + chrsToSkip=[], + mappingStatsList=[], +): r""" Subdivides the genome into chunks to be analyzed in parallel using several processors. The code handles the creation of @@ -83,20 +91,28 @@ def estimateScaleFactor(bamFilesList, binLength, numberOfSamples, else: mappedReads = [] for fname in bamFilesList: - mappedReads.append(bamHandler.openBam(fname, returnStats=True, nThreads=numberOfProcessors)[1]) - - sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64') - - sizeFactorBasedOnMappedReads = sizeFactorBasedOnMappedReads.min() / sizeFactorBasedOnMappedReads - - cr = countR.CountReadsPerBin(bamFilesList, - binLength=binLength, - numberOfSamples=numberOfSamples, - extendReads=False, - blackListFileName=blackListFileName, - numberOfProcessors=numberOfProcessors, - verbose=verbose, - chrsToSkip=chrsToSkip) + mappedReads.append( + bamHandler.openBam( + fname, returnStats=True, nThreads=numberOfProcessors + )[1] + ) + + sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype="float64") + + sizeFactorBasedOnMappedReads = ( + sizeFactorBasedOnMappedReads.min() / sizeFactorBasedOnMappedReads + ) + + cr = countR.CountReadsPerBin( + bamFilesList, + binLength=binLength, + numberOfSamples=numberOfSamples, + extendReads=False, + blackListFileName=blackListFileName, + numberOfProcessors=numberOfProcessors, + verbose=verbose, + chrsToSkip=chrsToSkip, + ) try: num_reads_per_bin = cr.run() @@ -127,7 +143,7 @@ def estimateScaleFactor(bamFilesList, binLength, numberOfSamples, # Take a lower rank to move to a region with probably # less peaks and more background. maxIndex = int(maxIndex * 0.8) - while(maxIndex < len(p)): + while maxIndex < len(p): # in rare cases the maxIndex maps to a zero value. # In such cases, the next index is used until # a non zero value appears. 
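# --- editor's annotation (illustrative sketch, not part of the diff) ---
# The hunk above reformats the mapped-read size-factor arithmetic. A minimal,
# self-contained sketch of that computation, using made-up read counts (the
# real values come from bamHandler.openBam(..., returnStats=True)):
import numpy as np

mapped_reads = np.array([28_000_000, 35_000_000, 21_000_000], dtype="float64")
# Every sample is scaled relative to the most shallowly sequenced one.
size_factors = mapped_reads.min() / mapped_reads
print(size_factors)  # -> [0.75 0.6  1.  ]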
@@ -136,8 +152,10 @@ def estimateScaleFactor(bamFilesList, binLength, numberOfSamples, break maxIndex += 1 - meanSES = [np.mean(np.sort(num_reads_per_bin[0, :])[:maxIndex]), - np.mean(np.sort(num_reads_per_bin[1, :])[:maxIndex])] + meanSES = [ + np.mean(np.sort(num_reads_per_bin[0, :])[:maxIndex]), + np.mean(np.sort(num_reads_per_bin[1, :])[:maxIndex]), + ] # the maxIndex may be too close to the the signal regions # so i take a more conservative approach by taking a close number @@ -151,9 +169,9 @@ def estimateScaleFactor(bamFilesList, binLength, numberOfSamples, mean = [] std = [] for values in num_reads_per_bin: - maxNumReads = (np.percentile(values, 90)) + maxNumReads = np.percentile(values, 90) if maxNumReads == 0: - maxNumReads = (np.percentile(values, 99)) + maxNumReads = np.percentile(values, 99) if maxNumReads == 0: print("all genomic regions sampled from one ") "of the bam files have no reads.\n" @@ -163,33 +181,38 @@ def estimateScaleFactor(bamFilesList, binLength, numberOfSamples, std.append(np.std(values)) mean = np.array(mean) - readsPerBin = mean if avg_method == 'mean' else median + readsPerBin = mean if avg_method == "mean" else median if min(median) == 0: idx_zero = [ix + 1 for ix, value in enumerate(median) if value == 0] - exit("\n*ERROR*: The median coverage computed is zero for sample(s) #{}\n" - "Try selecting a larger sample size or a region with coverage\n".format(idx_zero)) + exit( + "\n*ERROR*: The median coverage computed is zero for sample(s) #{}\n" + "Try selecting a larger sample size or a region with coverage\n".format( + idx_zero + ) + ) sizeFactor = sizeFactorsSES - return {'size_factors': sizeFactor, - 'size_factors_based_on_mapped_reads': sizeFactorBasedOnMappedReads, - 'size_factors_SES': sizeFactorsSES, - 'size_factors_based_on_mean': mean.min() / mean, - 'size_factors_based_on_median': median.min() / median, - 'mean': mean, - 'meanSES': meanSES, - 'median': median, - 'reads_per_bin': readsPerBin, - 'std': std, - 'sites_sampled': sitesSampled} + return { + "size_factors": sizeFactor, + "size_factors_based_on_mapped_reads": sizeFactorBasedOnMappedReads, + "size_factors_SES": sizeFactorsSES, + "size_factors_based_on_mean": mean.min() / mean, + "size_factors_based_on_median": median.min() / median, + "mean": mean, + "meanSES": meanSES, + "median": median, + "reads_per_bin": readsPerBin, + "std": std, + "sites_sampled": sitesSampled, + } class Tester(object): - def __init__(self): self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/" self.bamFile1 = self.root + "testA.bam" self.bamFile2 = self.root + "testB.bam" global debug debug = 0 - self.chrom = '3R' + self.chrom = "3R" diff --git a/deeptools/_version.py b/deeptools/_version.py index 2c9603d53..96b1b7b8d 100755 --- a/deeptools/_version.py +++ b/deeptools/_version.py @@ -1,5 +1,4 @@ - # This file is originally generated from Git information by running 'setup.py # version'. Distribution tarballs contain a pre-generated copy of this file. -__version__ = '3.5.1' +__version__ = "3.5.1" diff --git a/deeptools/alignmentSieve.py b/deeptools/alignmentSieve.py index 45851958b..0f2832c27 100644 --- a/deeptools/alignmentSieve.py +++ b/deeptools/alignmentSieve.py @@ -15,133 +15,171 @@ def parseArguments(): parser = argparse.ArgumentParser( formatter_class=argparse.RawDescriptionHelpFormatter, description="This tool filters alignments in a BAM/CRAM file according the the specified parameters. 
It can optionally output to BEDPE format.", - usage='Example usage: alignmentSieve.py -b sample1.bam -o sample1.filtered.bam --minMappingQuality 10 --filterMetrics log.txt') - - required = parser.add_argument_group('Required arguments') - required.add_argument('--bam', '-b', - metavar='FILE1', - help='An indexed BAM file.', - required=True) - - required.add_argument('--outFile', '-o', - help='The file to write results to. These are the alignments or fragments that pass the filtering criteria.') - - general = parser.add_argument_group('General arguments') - general.add_argument('--numberOfProcessors', '-p', - help='Number of processors to use. Type "max/2" to ' - 'use half the maximum number of processors or "max" ' - 'to use all available processors. (Default: %(default)s)', - metavar="INT", - type=parserCommon.numberOfProcessors, - default=1, - required=False) - - general.add_argument('--filterMetrics', - metavar="FILE.log", - help="The number of entries in total and filtered are saved to this file") - - general.add_argument('--filteredOutReads', - metavar="filtered.bam", - help="If desired, all reads NOT passing the filtering criteria can be written to this file.") - - general.add_argument('--label', '-l', - metavar='sample1', - help='User defined label instead of the default label ' - '(file name).') - - general.add_argument('--smartLabels', - action='store_true', - help='Instead of manually specifying a labels for the input ' - 'file, this causes deepTools to use the file name ' - 'after removing the path and extension.') - - general.add_argument('--verbose', '-v', - help='Set to see processing messages.', - action='store_true') - - general.add_argument('--version', action='version', - version='%(prog)s {}'.format(__version__)) - - general.add_argument('--shift', - nargs='+', - type=int, - help='Shift the left and right end of a read (for BAM files) or a fragment (for BED files). A positive value shift an end to the right (on the + strand) and a negative value shifts a fragment to the left. Either 2 or 4 integers can be provided. For example, "2 -3" will shift the left-most fragment end two bases to the right and the right-most end 3 bases to the left. If 4 integers are provided, then the first and last two refer to fragments whose read 1 is on the left or right, respectively. Consequently, it is possible to take strand into consideration for strand-specific protocols. A fragment whose length falls below 1 due to shifting will not be written to the output. See the online documentation for graphical examples. Note that non-properly-paired reads will be filtered.') - - general.add_argument('--ATACshift', - action='store_true', - help='Shift the produced BAM file or BEDPE regions as commonly done for ATAC-seq. This is equivalent to --shift 4 -5 5 -4.') - - output = parser.add_argument_group('Output arguments') - output.add_argument('--BED', - action='store_true', - help='Instead of producing BAM files, write output in BEDPE format (as defined by MACS2). Note that only reads/fragments passing filtering criterion are written in BEDPE format.') - - filtering = parser.add_argument_group('Optional arguments') - - filtering.add_argument('--filterRNAstrand', - help='Selects RNA-seq reads (single-end or paired-end) in ' - 'the given strand. (Default: %(default)s)', - choices=['forward', 'reverse'], - default=None) - - filtering.add_argument('--ignoreDuplicates', - help='If set, reads that have the same orientation ' - 'and start position will be considered only ' - 'once. 
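# --- editor's annotation (illustrative sketch, not part of the diff) ---
# The --shift help above describes the two-integer case. This simplified
# stand-in (not alignmentSieve's internal code) shows the arithmetic:
def shift_fragment(start, end, left_shift, right_shift):
    # Positive values move an end to the right, negative values to the left.
    new_start, new_end = start + left_shift, end + right_shift
    # Fragments whose length falls below 1 after shifting are dropped.
    return (new_start, new_end) if new_end - new_start >= 1 else None

print(shift_fragment(100, 200, 2, -3))  # --shift 2 -3 -> (102, 197)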
If reads are paired, the mate\'s position ' - 'also has to coincide to ignore a read.', - action='store_true') - - filtering.add_argument('--minMappingQuality', - metavar='INT', - help='If set, only reads that have a mapping ' - 'quality score of at least this are ' - 'considered.', - type=int) - - filtering.add_argument('--samFlagInclude', - help='Include reads based on the SAM flag. For example, ' - 'to get only reads that are the first mate, use a flag of 64. ' - 'This is useful to count properly paired reads only once, ' - 'as otherwise the second mate will be also considered for the ' - 'coverage.', - metavar='INT', - default=None, - type=int, - required=False) - - filtering.add_argument('--samFlagExclude', - help='Exclude reads based on the SAM flag. For example, ' - 'to get only reads that map to the forward strand, use ' - '--samFlagExclude 16, where 16 is the SAM flag for reads ' - 'that map to the reverse strand.', - metavar='INT', - default=None, - type=int, - required=False) - - filtering.add_argument('--blackListFileName', '-bl', - help="A BED or GTF file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant.", - metavar="BED file", - nargs="+", - required=False) - - filtering.add_argument('--minFragmentLength', - help='The minimum fragment length needed for read/pair ' - 'inclusion. This option is primarily useful ' - 'in ATACseq experiments, for filtering mono- or ' - 'di-nucleosome fragments. (Default: %(default)s)', - metavar='INT', - default=0, - type=int, - required=False) - - filtering.add_argument('--maxFragmentLength', - help='The maximum fragment length needed for read/pair ' - 'inclusion. A value of 0 indicates no limit. (Default: %(default)s)', - metavar='INT', - default=0, - type=int, - required=False) + usage="Example usage: alignmentSieve.py -b sample1.bam -o sample1.filtered.bam --minMappingQuality 10 --filterMetrics log.txt", + ) + + required = parser.add_argument_group("Required arguments") + required.add_argument( + "--bam", "-b", metavar="FILE1", help="An indexed BAM file.", required=True + ) + + required.add_argument( + "--outFile", + "-o", + help="The file to write results to. These are the alignments or fragments that pass the filtering criteria.", + ) + + general = parser.add_argument_group("General arguments") + general.add_argument( + "--numberOfProcessors", + "-p", + help='Number of processors to use. Type "max/2" to ' + 'use half the maximum number of processors or "max" ' + "to use all available processors. 
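# --- editor's annotation (illustrative sketch, not part of the diff) ---
# One plausible reading of the "max"/"max/2" processor spec described above;
# the real parser is parserCommon.numberOfProcessors, which is not shown here.
import multiprocessing

def resolve_processors(spec):
    available = multiprocessing.cpu_count()
    if spec == "max":
        return available
    if spec == "max/2":
        return max(1, available // 2)
    return int(spec)  # plain integers pass through

print(resolve_processors("max/2"))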
(Default: %(default)s)", + metavar="INT", + type=parserCommon.numberOfProcessors, + default=1, + required=False, + ) + + general.add_argument( + "--filterMetrics", + metavar="FILE.log", + help="The number of entries in total and filtered are saved to this file", + ) + + general.add_argument( + "--filteredOutReads", + metavar="filtered.bam", + help="If desired, all reads NOT passing the filtering criteria can be written to this file.", + ) + + general.add_argument( + "--label", + "-l", + metavar="sample1", + help="User defined label instead of the default label " "(file name).", + ) + + general.add_argument( + "--smartLabels", + action="store_true", + help="Instead of manually specifying a labels for the input " + "file, this causes deepTools to use the file name " + "after removing the path and extension.", + ) + + general.add_argument( + "--verbose", "-v", help="Set to see processing messages.", action="store_true" + ) + + general.add_argument( + "--version", action="version", version="%(prog)s {}".format(__version__) + ) + + general.add_argument( + "--shift", + nargs="+", + type=int, + help='Shift the left and right end of a read (for BAM files) or a fragment (for BED files). A positive value shift an end to the right (on the + strand) and a negative value shifts a fragment to the left. Either 2 or 4 integers can be provided. For example, "2 -3" will shift the left-most fragment end two bases to the right and the right-most end 3 bases to the left. If 4 integers are provided, then the first and last two refer to fragments whose read 1 is on the left or right, respectively. Consequently, it is possible to take strand into consideration for strand-specific protocols. A fragment whose length falls below 1 due to shifting will not be written to the output. See the online documentation for graphical examples. Note that non-properly-paired reads will be filtered.', + ) + + general.add_argument( + "--ATACshift", + action="store_true", + help="Shift the produced BAM file or BEDPE regions as commonly done for ATAC-seq. This is equivalent to --shift 4 -5 5 -4.", + ) + + output = parser.add_argument_group("Output arguments") + output.add_argument( + "--BED", + action="store_true", + help="Instead of producing BAM files, write output in BEDPE format (as defined by MACS2). Note that only reads/fragments passing filtering criterion are written in BEDPE format.", + ) + + filtering = parser.add_argument_group("Optional arguments") + + filtering.add_argument( + "--filterRNAstrand", + help="Selects RNA-seq reads (single-end or paired-end) in " + "the given strand. (Default: %(default)s)", + choices=["forward", "reverse"], + default=None, + ) + + filtering.add_argument( + "--ignoreDuplicates", + help="If set, reads that have the same orientation " + "and start position will be considered only " + "once. If reads are paired, the mate's position " + "also has to coincide to ignore a read.", + action="store_true", + ) + + filtering.add_argument( + "--minMappingQuality", + metavar="INT", + help="If set, only reads that have a mapping " + "quality score of at least this are " + "considered.", + type=int, + ) + + filtering.add_argument( + "--samFlagInclude", + help="Include reads based on the SAM flag. For example, " + "to get only reads that are the first mate, use a flag of 64. 
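# --- editor's annotation (illustrative sketch, not part of the diff) ---
# The SAM-flag arithmetic behind --samFlagInclude/--samFlagExclude, sketched
# as a free-standing helper (the actual filtering loop appears further down):
FIRST_IN_PAIR = 64    # SAM flag bit for read 1
REVERSE_STRAND = 16   # SAM flag bit for a reverse-strand alignment

def keep(flag, include=None, exclude=None):
    # --samFlagInclude: every requested bit must be set in the read's flag.
    if include is not None and flag & include != include:
        return False
    # --samFlagExclude: none of the requested bits may be set.
    if exclude is not None and flag & exclude != 0:
        return False
    return True

print(keep(99, include=FIRST_IN_PAIR))    # True: flag 99 has bit 64 set
print(keep(147, exclude=REVERSE_STRAND))  # False: flag 147 has bit 16 set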
" + "This is useful to count properly paired reads only once, " + "as otherwise the second mate will be also considered for the " + "coverage.", + metavar="INT", + default=None, + type=int, + required=False, + ) + + filtering.add_argument( + "--samFlagExclude", + help="Exclude reads based on the SAM flag. For example, " + "to get only reads that map to the forward strand, use " + "--samFlagExclude 16, where 16 is the SAM flag for reads " + "that map to the reverse strand.", + metavar="INT", + default=None, + type=int, + required=False, + ) + + filtering.add_argument( + "--blackListFileName", + "-bl", + help="A BED or GTF file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant.", + metavar="BED file", + nargs="+", + required=False, + ) + + filtering.add_argument( + "--minFragmentLength", + help="The minimum fragment length needed for read/pair " + "inclusion. This option is primarily useful " + "in ATACseq experiments, for filtering mono- or " + "di-nucleosome fragments. (Default: %(default)s)", + metavar="INT", + default=0, + type=int, + required=False, + ) + + filtering.add_argument( + "--maxFragmentLength", + help="The maximum fragment length needed for read/pair " + "inclusion. A value of 0 indicates no limit. (Default: %(default)s)", + metavar="INT", + default=0, + type=int, + required=False, + ) return parser @@ -205,10 +243,10 @@ def filterWorker(arglist): chrom, start, end, args, chromDict = arglist fh = openBam(args.bam) - mode = 'wbu' - oname = getTempFileName(suffix='.bam') + mode = "wbu" + oname = getTempFileName(suffix=".bam") if args.filteredOutReads: - onameFiltered = getTempFileName(suffix='.bam') + onameFiltered = getTempFileName(suffix=".bam") else: onameFiltered = None ofh = pysam.AlignmentFile(oname, mode=mode, template=fh) @@ -241,7 +279,10 @@ def filterWorker(arglist): ofiltered.write(read) continue - if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude: + if ( + args.samFlagInclude + and read.flag & args.samFlagInclude != args.samFlagInclude + ): nFiltered += 1 if ofiltered: ofiltered.write(read) @@ -274,8 +315,11 @@ def filterWorker(arglist): e = s - tLen if read.reference_id != read.next_reference_id: e = read.pnext - if lpos is not None and lpos == read.reference_start \ - and (s, e, read.next_reference_id, read.is_reverse) in prev_pos: + if ( + lpos is not None + and lpos == read.reference_start + and (s, e, read.next_reference_id, read.is_reverse) in prev_pos + ): nFiltered += 1 if ofiltered: ofiltered.write(read) @@ -288,7 +332,7 @@ def filterWorker(arglist): # filterRNAstrand if args.filterRNAstrand: if read.is_paired: - if args.filterRNAstrand == 'forward': + if args.filterRNAstrand == "forward": if read.flag & 144 == 128 or read.flag & 96 == 64: pass else: @@ -296,7 +340,7 @@ def filterWorker(arglist): if ofiltered: ofiltered.write(read) continue - elif args.filterRNAstrand == 'reverse': + elif args.filterRNAstrand == "reverse": if read.flag & 144 == 144 or read.flag & 96 == 96: pass else: @@ -305,7 +349,7 @@ def filterWorker(arglist): ofiltered.write(read) continue else: - if args.filterRNAstrand == 'forward': + if args.filterRNAstrand == "forward": if read.flag & 16 == 16: pass else: @@ -313,7 +357,7 @@ def 
filterWorker(arglist): if ofiltered: ofiltered.write(read) continue - elif args.filterRNAstrand == 'reverse': + elif args.filterRNAstrand == "reverse": if read.flag & 16 == 0: pass else: @@ -376,18 +420,22 @@ def main(args=None): elif args.ATACshift: args.shift = [4, -5, 5, -4] - bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors) + bam, mapped, unmapped, stats = openBam( + args.bam, returnStats=True, nThreads=args.numberOfProcessors + ) total = mapped + unmapped chrom_sizes = [(x, y) for x, y in zip(bam.references, bam.lengths)] chromDict = {x: y for x, y in zip(bam.references, bam.lengths)} # Filter, writing the results to a bunch of temporary files - res = mapReduce([args, chromDict], - filterWorker, - chrom_sizes, - blackListFileName=args.blackListFileName, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose) + res = mapReduce( + [args, chromDict], + filterWorker, + chrom_sizes, + blackListFileName=args.blackListFileName, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + ) res = sorted(res) # The temp files are now in order for concatenation nFiltered = sum([x[3] for x in res]) diff --git a/deeptools/bamCompare.py b/deeptools/bamCompare.py index 9f19321f9..12c3f0ccb 100644 --- a/deeptools/bamCompare.py +++ b/deeptools/bamCompare.py @@ -13,8 +13,9 @@ from deeptools.getRatio import getRatio from deeptools.getScaleFactor import get_num_kept_reads from deeptools.getScaleFactor import get_scale_factor + debug = 0 -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") def parseArguments(): @@ -25,28 +26,33 @@ def parseArguments(): optionalArgs = getOptionalArgs() outputParser = parserCommon.output() parser = argparse.ArgumentParser( - parents=[requiredArgs, outputParser, optionalArgs, - parentParser, normalizationParser, bamParser], + parents=[ + requiredArgs, + outputParser, + optionalArgs, + parentParser, + normalizationParser, + bamParser, + ], formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description='This tool compares two BAM files based on the number of ' - 'mapped reads. To compare the BAM files, the genome is partitioned ' - 'into bins of equal size, then the number of reads found in each bin' - ' is counted per file, and finally a summary value is ' - 'reported. This value can be the ratio of the number of reads per ' - 'bin, the log2 of the ratio, or the difference. This tool can ' - 'normalize the number of reads in each BAM file using the SES method ' + description="This tool compares two BAM files based on the number of " + "mapped reads. To compare the BAM files, the genome is partitioned " + "into bins of equal size, then the number of reads found in each bin" + " is counted per file, and finally a summary value is " + "reported. This value can be the ratio of the number of reads per " + "bin, the log2 of the ratio, or the difference. This tool can " + "normalize the number of reads in each BAM file using the SES method " 'proposed by Diaz et al. (2012) "Normalization, bias correction, and ' 'peak calling for ChIP-seq". Statistical Applications in Genetics ' - 'and Molecular Biology, 11(3). Normalization based on read counts ' - 'is also available. The output is either a bedgraph or bigWig file ' - 'containing the bin location and the resulting comparison value. ' - 'Note that *each end* in a pair (for paired-end reads) is treated ' - 'independently. 
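# --- editor's annotation (illustrative sketch, not part of the diff) ---
# The paired-end strand test reformatted above, isolated for readability:
# 144 == 128|16 (read 2, reverse) and 96 == 64|32 (read 1, mate reverse), so
# 'forward' keeps read-2-on-plus-strand or read-1-with-forward-mate alignments.
def is_forward_template(flag):
    return flag & 144 == 128 or flag & 96 == 64

print(is_forward_template(163))  # True: 163 = 1|2|32|128 (read 2, plus strand)
print(is_forward_template(99))   # False: 99 = 1|2|32|64 (read 1, plus strand)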
If this is undesirable, then use the --samFlagInclude ' - 'or --samFlagExclude options.', - - usage=' bamCompare -b1 treatment.bam -b2 control.bam -o log2ratio.bw', - - add_help=False) + "and Molecular Biology, 11(3). Normalization based on read counts " + "is also available. The output is either a bedgraph or bigWig file " + "containing the bin location and the resulting comparison value. " + "Note that *each end* in a pair (for paired-end reads) is treated " + "independently. If this is undesirable, then use the --samFlagInclude " + "or --samFlagExclude options.", + usage=" bamCompare -b1 treatment.bam -b2 control.bam -o log2ratio.bw", + add_help=False, + ) return parser @@ -54,101 +60,129 @@ def parseArguments(): def getRequiredArgs(): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') + required = parser.add_argument_group("Required arguments") # define the arguments - required.add_argument('--bamfile1', '-b1', - metavar='BAM file', - help='Sorted BAM file 1. Usually the BAM file ' - 'for the treatment.', - required=True) - - required.add_argument('--bamfile2', '-b2', - metavar='BAM file', - help='Sorted BAM file 2. Usually the BAM ' - 'file for the control.', - required=True) + required.add_argument( + "--bamfile1", + "-b1", + metavar="BAM file", + help="Sorted BAM file 1. Usually the BAM file " "for the treatment.", + required=True, + ) + + required.add_argument( + "--bamfile2", + "-b2", + metavar="BAM file", + help="Sorted BAM file 2. Usually the BAM " "file for the control.", + required=True, + ) return parser def getOptionalArgs(): - parser = argparse.ArgumentParser(add_help=False) - optional = parser.add_argument_group('Optional arguments') - - optional.add_argument("--help", "-h", action="help", - help="show this help message and exit") - - optional.add_argument('--scaleFactorsMethod', - help='Method to use to scale the samples. ' - 'If a method is specified, then it will be used to compensate ' - 'for sequencing depth differences between the samples. ' - 'As an alternative, this can be set to None and an option from ' - '--normalizeUsing can be used. (Default: %(default)s)', - choices=['readCount', 'SES', 'None'], - default='readCount') - - optional.add_argument('--sampleLength', '-l', - help='*Only relevant when SES is chosen for the ' - 'scaleFactorsMethod.* To compute the SES, specify ' - 'the length (in bases) of the regions (see --numberOfSamples) ' - 'that will be randomly sampled to calculate the scaling factors. ' - 'If you do not have a good sequencing depth for ' - 'your samples consider increasing the sampling ' - 'regions\' size to minimize the probability ' - 'that zero-coverage regions are used. (Default: %(default)s)', - default=1000, - type=int) - - optional.add_argument('--numberOfSamples', '-n', - help='*Only relevant when SES is chosen for the ' - 'scaleFactorsMethod.* Number of samplings taken ' - 'from the genome to compute the scaling factors. (Default: %(default)s)', - default=1e5, - type=int) - - optional.add_argument('--scaleFactors', - help='Set this parameter manually to avoid the computation of ' - 'scaleFactors. The format is scaleFactor1:scaleFactor2.' - 'For example, --scaleFactor 0.7:1 will cause the first BAM file to' - 'be multiplied by 0.7, while not scaling ' - 'the second BAM file (multiplication with 1).', - default=None, - required=False) - - optional.add_argument('--operation', - help='The default is to output the log2 ratio of the ' - 'two samples. 
The reciprocal ratio returns the ' - 'the negative of the inverse of the ratio ' - 'if the ratio is less than 0. The resulting ' - 'values are interpreted as negative fold changes. ' - 'Instead of performing a computation using both files, the scaled signal can ' - 'alternatively be output for the first or second file using ' - 'the \'--operation first\' or \'--operation second\'. (Default: %(default)s)', - default='log2', - choices=['log2', 'ratio', 'subtract', 'add', 'mean', - 'reciprocal_ratio', 'first', 'second'], - required=False) - - optional.add_argument('--pseudocount', - help='A small number to avoid x/0. Only useful ' - 'together with --operation log2 or --operation ratio. ' - 'You can specify different values as pseudocounts for ' - 'the numerator and the denominator by providing two ' - 'values (the first value is used as the numerator ' - 'pseudocount and the second the denominator pseudocount). (Default: %(default)s)', - default=[1], - type=float, - nargs='+', - action=parserCommon.requiredLength(1, 2), - required=False) - - optional.add_argument('--skipZeroOverZero', - help='Skip bins where BOTH BAM files lack coverage. ' - 'This is determined BEFORE any applicable pseudocount ' - 'is added.', - action='store_true') + optional = parser.add_argument_group("Optional arguments") + + optional.add_argument( + "--help", "-h", action="help", help="show this help message and exit" + ) + + optional.add_argument( + "--scaleFactorsMethod", + help="Method to use to scale the samples. " + "If a method is specified, then it will be used to compensate " + "for sequencing depth differences between the samples. " + "As an alternative, this can be set to None and an option from " + "--normalizeUsing can be used. (Default: %(default)s)", + choices=["readCount", "SES", "None"], + default="readCount", + ) + + optional.add_argument( + "--sampleLength", + "-l", + help="*Only relevant when SES is chosen for the " + "scaleFactorsMethod.* To compute the SES, specify " + "the length (in bases) of the regions (see --numberOfSamples) " + "that will be randomly sampled to calculate the scaling factors. " + "If you do not have a good sequencing depth for " + "your samples consider increasing the sampling " + "regions' size to minimize the probability " + "that zero-coverage regions are used. (Default: %(default)s)", + default=1000, + type=int, + ) + + optional.add_argument( + "--numberOfSamples", + "-n", + help="*Only relevant when SES is chosen for the " + "scaleFactorsMethod.* Number of samplings taken " + "from the genome to compute the scaling factors. (Default: %(default)s)", + default=1e5, + type=int, + ) + + optional.add_argument( + "--scaleFactors", + help="Set this parameter manually to avoid the computation of " + "scaleFactors. The format is scaleFactor1:scaleFactor2." + "For example, --scaleFactor 0.7:1 will cause the first BAM file to" + "be multiplied by 0.7, while not scaling " + "the second BAM file (multiplication with 1).", + default=None, + required=False, + ) + + optional.add_argument( + "--operation", + help="The default is to output the log2 ratio of the " + "two samples. The reciprocal ratio returns the " + "the negative of the inverse of the ratio " + "if the ratio is less than 0. The resulting " + "values are interpreted as negative fold changes. " + "Instead of performing a computation using both files, the scaled signal can " + "alternatively be output for the first or second file using " + "the '--operation first' or '--operation second'. 
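# --- editor's annotation (illustrative sketch, not part of the diff) ---
# What the default --operation log2 computes per bin, with the --pseudocount
# guard against x/0 (a simplified stand-in, not deepTools' getRatio itself):
import numpy as np

def log2_ratio(treat, control, pseudocount=(1.0, 1.0)):
    # Pseudocounts keep zero-coverage bins finite: log2((t + p1) / (c + p2)).
    return np.log2((treat + pseudocount[0]) / (control + pseudocount[1]))

print(log2_ratio(np.array([15.0, 0.0]), np.array([3.0, 0.0])))  # -> [2. 0.]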
(Default: %(default)s)", + default="log2", + choices=[ + "log2", + "ratio", + "subtract", + "add", + "mean", + "reciprocal_ratio", + "first", + "second", + ], + required=False, + ) + + optional.add_argument( + "--pseudocount", + help="A small number to avoid x/0. Only useful " + "together with --operation log2 or --operation ratio. " + "You can specify different values as pseudocounts for " + "the numerator and the denominator by providing two " + "values (the first value is used as the numerator " + "pseudocount and the second the denominator pseudocount). (Default: %(default)s)", + default=[1], + type=float, + nargs="+", + action=parserCommon.requiredLength(1, 2), + required=False, + ) + + optional.add_argument( + "--skipZeroOverZero", + help="Skip bins where BOTH BAM files lack coverage. " + "This is determined BEFORE any applicable pseudocount " + "is added.", + action="store_true", + ) return parser @@ -157,10 +191,11 @@ def process_args(args=None): args = parseArguments().parse_args(args) if args.smoothLength and args.smoothLength <= args.binSize: - print("Warning: the smooth length given ({}) is smaller than the bin " - "size ({}).\n\n No smoothing will be " - "done".format(args.smoothLength, - args.binSize)) + print( + "Warning: the smooth length given ({}) is smaller than the bin " + "size ({}).\n\n No smoothing will be " + "done".format(args.smoothLength, args.binSize) + ) args.smoothLength = None if not args.ignoreForNormalization: @@ -174,39 +209,43 @@ def process_args(args=None): return args + # get_scale_factors function is used for scaling in bamCompare # while get_scale_factor is used for depth normalization def get_scale_factors(args, statsList, mappedList): - if args.scaleFactors: scale_factors = list(map(float, args.scaleFactors.split(":"))) - elif args.scaleFactorsMethod == 'SES': + elif args.scaleFactorsMethod == "SES": scalefactors_dict = estimateScaleFactor( [args.bamfile1, args.bamfile2], - args.sampleLength, args.numberOfSamples, + args.sampleLength, + args.numberOfSamples, 1, mappingStatsList=mappedList, blackListFileName=args.blackListFileName, numberOfProcessors=args.numberOfProcessors, verbose=args.verbose, - chrsToSkip=args.ignoreForNormalization) + chrsToSkip=args.ignoreForNormalization, + ) - scale_factors = scalefactors_dict['size_factors'] + scale_factors = scalefactors_dict["size_factors"] if args.verbose: print("Size factors using SES: {}".format(scale_factors)) - print("%s regions of size %s where used " % - (scalefactors_dict['sites_sampled'], - args.sampleLength)) - - print("ignoring filtering/blacklists, size factors if the number of mapped " - "reads would have been used:") - print(tuple( - float(min(mappedList)) / np.array(mappedList))) - - elif args.scaleFactorsMethod == 'readCount': + print( + "%s regions of size %s where used " + % (scalefactors_dict["sites_sampled"], args.sampleLength) + ) + + print( + "ignoring filtering/blacklists, size factors if the number of mapped " + "reads would have been used:" + ) + print(tuple(float(min(mappedList)) / np.array(mappedList))) + + elif args.scaleFactorsMethod == "readCount": # change the scaleFactor to 1.0 args.scaleFactor = 1.0 # get num of kept reads for bam file 1 @@ -221,10 +260,12 @@ def get_scale_factors(args, statsList, mappedList): # new scale_factors (relative to min of two bams) scale_factors = float(min(bam1_mapped, bam2_mapped)) / np.array(mapped_reads) if args.verbose: - print("Size factors using total number " - "of mapped reads: {}".format(scale_factors)) + print( + "Size factors using 
total number " + "of mapped reads: {}".format(scale_factors) + ) - elif args.scaleFactorsMethod == 'None': + elif args.scaleFactorsMethod == "None": scale_factors = None return scale_factors @@ -250,16 +291,26 @@ def main(args=None): args = process_args(args) if args.normalizeUsing == "RPGC": - sys.exit("RPGC normalization (--normalizeUsing RPGC) is not supported with bamCompare!") - if args.normalizeUsing == 'None': + sys.exit( + "RPGC normalization (--normalizeUsing RPGC) is not supported with bamCompare!" + ) + if args.normalizeUsing == "None": args.normalizeUsing = None # For the sake of sanity - if args.scaleFactorsMethod != 'None' and args.normalizeUsing: - sys.exit("`--normalizeUsing {}` is only valid if you also use `--scaleFactorsMethod None`! To prevent erroneous output, I will quit now.\n".format(args.normalizeUsing)) + if args.scaleFactorsMethod != "None" and args.normalizeUsing: + sys.exit( + "`--normalizeUsing {}` is only valid if you also use `--scaleFactorsMethod None`! To prevent erroneous output, I will quit now.\n".format( + args.normalizeUsing + ) + ) # Get mapping statistics - bam1, mapped1, unmapped1, stats1 = bamHandler.openBam(args.bamfile1, returnStats=True, nThreads=args.numberOfProcessors) + bam1, mapped1, unmapped1, stats1 = bamHandler.openBam( + args.bamfile1, returnStats=True, nThreads=args.numberOfProcessors + ) bam1.close() - bam2, mapped2, unmapped2, stats2 = bamHandler.openBam(args.bamfile2, returnStats=True, nThreads=args.numberOfProcessors) + bam2, mapped2, unmapped2, stats2 = bamHandler.openBam( + args.bamfile2, returnStats=True, nThreads=args.numberOfProcessors + ) bam2.close() scale_factors = get_scale_factors(args, [stats1, stats2], [mapped1, mapped2]) @@ -282,31 +333,42 @@ def main(args=None): # the getRatio function is called and receives # the func_args per each tile that is considered FUNC = getRatio - func_args = {'valueType': args.operation, - 'scaleFactors': scale_factors, - 'pseudocount': args.pseudocount - } - - wr = writeBedGraph.WriteBedGraph([args.bamfile1, args.bamfile2], args.binSize, 0, - stepSize=args.binSize, - region=args.region, - numberOfProcessors=args.numberOfProcessors, - extendReads=args.extendReads, - blackListFileName=args.blackListFileName, - minMappingQuality=args.minMappingQuality, - ignoreDuplicates=args.ignoreDuplicates, - center_read=args.centerReads, - zerosToNans=args.skipNonCoveredRegions, - skipZeroOverZero=args.skipZeroOverZero, - samFlag_include=args.samFlagInclude, - samFlag_exclude=args.samFlagExclude, - minFragmentLength=args.minFragmentLength, - maxFragmentLength=args.maxFragmentLength, - chrsToSkip=args.ignoreForNormalization, - verbose=args.verbose - ) - - wr.run(FUNC, func_args, args.outFileName, blackListFileName=args.blackListFileName, format=args.outFileFormat, smoothLength=args.smoothLength) + func_args = { + "valueType": args.operation, + "scaleFactors": scale_factors, + "pseudocount": args.pseudocount, + } + + wr = writeBedGraph.WriteBedGraph( + [args.bamfile1, args.bamfile2], + args.binSize, + 0, + stepSize=args.binSize, + region=args.region, + numberOfProcessors=args.numberOfProcessors, + extendReads=args.extendReads, + blackListFileName=args.blackListFileName, + minMappingQuality=args.minMappingQuality, + ignoreDuplicates=args.ignoreDuplicates, + center_read=args.centerReads, + zerosToNans=args.skipNonCoveredRegions, + skipZeroOverZero=args.skipZeroOverZero, + samFlag_include=args.samFlagInclude, + samFlag_exclude=args.samFlagExclude, + minFragmentLength=args.minFragmentLength, + 
maxFragmentLength=args.maxFragmentLength, + chrsToSkip=args.ignoreForNormalization, + verbose=args.verbose, + ) + + wr.run( + FUNC, + func_args, + args.outFileName, + blackListFileName=args.blackListFileName, + format=args.outFileFormat, + smoothLength=args.smoothLength, + ) if __name__ == "__main__": diff --git a/deeptools/bamCoverage.py b/deeptools/bamCoverage.py index c0002a59d..1593847ad 100644 --- a/deeptools/bamCoverage.py +++ b/deeptools/bamCoverage.py @@ -20,25 +20,30 @@ def parseArguments(): requiredArgs = get_required_args() optionalArgs = get_optional_args() outputParser = parserCommon.output() - parser = \ - argparse.ArgumentParser( - parents=[requiredArgs, outputParser, optionalArgs, - parentParser, normalizationParser, bamParser], - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description='This tool takes an alignment of reads or fragments ' - 'as input (BAM file) and generates a coverage track (bigWig or ' - 'bedGraph) as output. ' - 'The coverage is calculated as the number of reads per bin, ' - 'where bins are short consecutive counting windows of a defined ' - 'size. It is possible to extended the length of the reads ' - 'to better reflect the actual fragment length. *bamCoverage* ' - 'offers normalization by scaling factor, Reads Per Kilobase per ' - 'Million mapped reads (RPKM), counts per million (CPM), bins per ' - 'million mapped reads (BPM) and 1x depth (reads per genome ' - 'coverage, RPGC).\n', - usage='An example usage is:' - '$ bamCoverage -b reads.bam -o coverage.bw', - add_help=False) + parser = argparse.ArgumentParser( + parents=[ + requiredArgs, + outputParser, + optionalArgs, + parentParser, + normalizationParser, + bamParser, + ], + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="This tool takes an alignment of reads or fragments " + "as input (BAM file) and generates a coverage track (bigWig or " + "bedGraph) as output. " + "The coverage is calculated as the number of reads per bin, " + "where bins are short consecutive counting windows of a defined " + "size. It is possible to extended the length of the reads " + "to better reflect the actual fragment length. *bamCoverage* " + "offers normalization by scaling factor, Reads Per Kilobase per " + "Million mapped reads (RPKM), counts per million (CPM), bins per " + "million mapped reads (BPM) and 1x depth (reads per genome " + "coverage, RPGC).\n", + usage="An example usage is:" "$ bamCoverage -b reads.bam -o coverage.bw", + add_help=False, + ) return parser @@ -46,70 +51,77 @@ def parseArguments(): def get_required_args(): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') + required = parser.add_argument_group("Required arguments") # define the arguments - required.add_argument('--bam', '-b', - help='BAM file to process', - metavar='BAM file', - required=True) + required.add_argument( + "--bam", "-b", help="BAM file to process", metavar="BAM file", required=True + ) return parser def get_optional_args(): - parser = argparse.ArgumentParser(add_help=False) - optional = parser.add_argument_group('Optional arguments') - - optional.add_argument("--help", "-h", action="help", - help="show this help message and exit") - - optional.add_argument('--scaleFactor', - help='The computed scaling factor (or 1, if not applicable) will ' - 'be multiplied by this. (Default: %(default)s)', - default=1.0, - type=float, - required=False) - - optional.add_argument('--MNase', - help='Determine nucleosome positions from MNase-seq data. 
' - 'Only 3 nucleotides at the center of each fragment are counted. ' - 'The fragment ends are defined by the two mate reads. Only fragment lengths' - 'between 130 - 200 bp are considered to avoid dinucleosomes or other artifacts. ' - 'By default, any fragments smaller or larger than this are ignored. To ' - 'over-ride this, use the --minFragmentLength and --maxFragmentLength options, ' - 'which will default to 130 and 200 if not otherwise specified in the presence ' - 'of --MNase. *NOTE*: Requires paired-end data. A bin size of 1 is recommended.', - action='store_true') - - optional.add_argument('--Offset', - help='Uses this offset inside of each read as the signal. This is useful in ' - 'cases like RiboSeq or GROseq, where the signal is 12, 15 or 0 bases past the ' - 'start of the read. This can be paired with the --filterRNAstrand option. ' - 'Note that negative values indicate offsets from the end of each read. A value ' - 'of 1 indicates the first base of the alignment (taking alignment orientation ' - 'into account). Likewise, a value of -1 is the last base of the alignment. An ' - 'offset of 0 is not permitted. If two values are specified, then they will be ' - 'used to specify a range of positions. Note that specifying something like ' - '--Offset 5 -1 will result in the 5th through last position being used, which ' - 'is equivalent to trimming 4 bases from the 5-prime end of alignments. Note ' - 'that if you specify --centerReads, the centering will be performed before the ' - 'offset.', - metavar='INT', - type=int, - nargs='+', - required=False) - - optional.add_argument('--filterRNAstrand', - help='Selects RNA-seq reads (single-end or paired-end) originating from genes ' - 'on the given strand. This option assumes a standard dUTP-based library ' - 'preparation (that is, --filterRNAstrand=forward keeps minus-strand reads, ' - 'which originally came from genes on the forward strand using a dUTP-based ' - 'method). Consider using --samExcludeFlag instead for filtering by strand in ' - 'other contexts.', - choices=['forward', 'reverse'], - default=None) + optional = parser.add_argument_group("Optional arguments") + + optional.add_argument( + "--help", "-h", action="help", help="show this help message and exit" + ) + + optional.add_argument( + "--scaleFactor", + help="The computed scaling factor (or 1, if not applicable) will " + "be multiplied by this. (Default: %(default)s)", + default=1.0, + type=float, + required=False, + ) + + optional.add_argument( + "--MNase", + help="Determine nucleosome positions from MNase-seq data. " + "Only 3 nucleotides at the center of each fragment are counted. " + "The fragment ends are defined by the two mate reads. Only fragment lengths" + "between 130 - 200 bp are considered to avoid dinucleosomes or other artifacts. " + "By default, any fragments smaller or larger than this are ignored. To " + "over-ride this, use the --minFragmentLength and --maxFragmentLength options, " + "which will default to 130 and 200 if not otherwise specified in the presence " + "of --MNase. *NOTE*: Requires paired-end data. A bin size of 1 is recommended.", + action="store_true", + ) + + optional.add_argument( + "--Offset", + help="Uses this offset inside of each read as the signal. This is useful in " + "cases like RiboSeq or GROseq, where the signal is 12, 15 or 0 bases past the " + "start of the read. This can be paired with the --filterRNAstrand option. " + "Note that negative values indicate offsets from the end of each read. 
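# --- editor's annotation (illustrative sketch, not part of the diff) ---
# How a 1-based --Offset pair can map onto Python slicing of the per-base
# positions covered by an alignment. This is a hypothetical helper, heavily
# simplified from OffsetFragment.get_fragment_from_read_list further below:
def offset_slice(positions, left, right):
    # 1-based from the 5' end; negative values count from the 3' end.
    start = left - 1 if left > 0 else len(positions) + left
    stop = right if right > 0 else len(positions) + right + 1
    return positions[start:stop]

stretch = list(range(100, 110))      # ten aligned positions
print(offset_slice(stretch, 5, -1))  # --Offset 5 -1: trims 4 bases from the 5' end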
A value " + "of 1 indicates the first base of the alignment (taking alignment orientation " + "into account). Likewise, a value of -1 is the last base of the alignment. An " + "offset of 0 is not permitted. If two values are specified, then they will be " + "used to specify a range of positions. Note that specifying something like " + "--Offset 5 -1 will result in the 5th through last position being used, which " + "is equivalent to trimming 4 bases from the 5-prime end of alignments. Note " + "that if you specify --centerReads, the centering will be performed before the " + "offset.", + metavar="INT", + type=int, + nargs="+", + required=False, + ) + + optional.add_argument( + "--filterRNAstrand", + help="Selects RNA-seq reads (single-end or paired-end) originating from genes " + "on the given strand. This option assumes a standard dUTP-based library " + "preparation (that is, --filterRNAstrand=forward keeps minus-strand reads, " + "which originally came from genes on the forward strand using a dUTP-based " + "method). Consider using --samExcludeFlag instead for filtering by strand in " + "other contexts.", + choices=["forward", "reverse"], + default=None, + ) return parser @@ -121,7 +133,8 @@ def scaleFactor(string): except: raise argparse.ArgumentTypeError( "Format of scaleFactors is factor1:factor2. " - "The value given ( {} ) is not valid".format(string)) + "The value given ( {} ) is not valid".format(string) + ) return scalefactors @@ -130,8 +143,12 @@ def process_args(args=None): args = parseArguments().parse_args(args) if args.smoothLength and args.smoothLength <= args.binSize: - print("Warning: the smooth length given ({}) is smaller than the bin " - "size ({}).\n\n No smoothing will be done".format(args.smoothLength, args.binSize)) + print( + "Warning: the smooth length given ({}) is smaller than the bin " + "size ({}).\n\n No smoothing will be done".format( + args.smoothLength, args.binSize + ) + ) args.smoothLength = None if not args.ignoreForNormalization: @@ -150,20 +167,22 @@ def main(args=None): else: debug = 0 - if args.normalizeUsing == 'None': + if args.normalizeUsing == "None": args.normalizeUsing = None # For the sake of sanity - elif args.normalizeUsing == 'RPGC' and not args.effectiveGenomeSize: + elif args.normalizeUsing == "RPGC" and not args.effectiveGenomeSize: sys.exit("RPGC normalization requires an --effectiveGenomeSize!\n") if args.normalizeUsing: # if a normalization is required then compute the scale factors - bam, mapped, unmapped, stats = openBam(args.bam, returnStats=True, nThreads=args.numberOfProcessors) + bam, mapped, unmapped, stats = openBam( + args.bam, returnStats=True, nThreads=args.numberOfProcessors + ) bam.close() scale_factor = get_scale_factor(args, stats) else: scale_factor = args.scaleFactor - func_args = {'scaleFactor': scale_factor} + func_args = {"scaleFactor": scale_factor} # This fixes issue #520, where --extendReads wasn't honored if --filterRNAstrand was used if args.filterRNAstrand and not args.Offset: @@ -173,13 +192,18 @@ def main(args=None): # check that library is paired end # using getFragmentAndReadSize from deeptools.getFragmentAndReadSize import get_read_and_fragment_length - frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam, - return_lengths=False, - blackListFileName=args.blackListFileName, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose) + + frag_len_dict, read_len_dict = get_read_and_fragment_length( + args.bam, + return_lengths=False, + blackListFileName=args.blackListFileName, + 
numberOfProcessors=args.numberOfProcessors,
+            verbose=args.verbose,
+        )
         if frag_len_dict is None:
-            sys.exit("*Error*: For the --MNAse function a paired end library is required. ")
+            sys.exit(
+                "*Error*: For the --MNAse function a paired end library is required. "
+            )
 
         # Set some default fragment length bounds
         if args.minFragmentLength == 0:
@@ -187,81 +211,97 @@ def main(args=None):
         if args.maxFragmentLength == 0:
             args.maxFragmentLength = 200
 
-        wr = CenterFragment([args.bam],
-                            binLength=args.binSize,
-                            stepSize=args.binSize,
-                            region=args.region,
-                            blackListFileName=args.blackListFileName,
-                            numberOfProcessors=args.numberOfProcessors,
-                            extendReads=args.extendReads,
-                            minMappingQuality=args.minMappingQuality,
-                            ignoreDuplicates=args.ignoreDuplicates,
-                            center_read=args.centerReads,
-                            zerosToNans=args.skipNonCoveredRegions,
-                            samFlag_include=args.samFlagInclude,
-                            samFlag_exclude=args.samFlagExclude,
-                            minFragmentLength=args.minFragmentLength,
-                            maxFragmentLength=args.maxFragmentLength,
-                            chrsToSkip=args.ignoreForNormalization,
-                            verbose=args.verbose,
-                            )
+        wr = CenterFragment(
+            [args.bam],
+            binLength=args.binSize,
+            stepSize=args.binSize,
+            region=args.region,
+            blackListFileName=args.blackListFileName,
+            numberOfProcessors=args.numberOfProcessors,
+            extendReads=args.extendReads,
+            minMappingQuality=args.minMappingQuality,
+            ignoreDuplicates=args.ignoreDuplicates,
+            center_read=args.centerReads,
+            zerosToNans=args.skipNonCoveredRegions,
+            samFlag_include=args.samFlagInclude,
+            samFlag_exclude=args.samFlagExclude,
+            minFragmentLength=args.minFragmentLength,
+            maxFragmentLength=args.maxFragmentLength,
+            chrsToSkip=args.ignoreForNormalization,
+            verbose=args.verbose,
+        )
     elif args.Offset:
         if len(args.Offset) > 1:
             if args.Offset[0] == 0:
-                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
+                sys.exit(
+                    "*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment."
+                )
             if args.Offset[1] > 0 and args.Offset[1] < args.Offset[0]:
-                sys.exir("'Error*: The right side bound is less than the left-side bound. This is inappropriate.")
+                sys.exit(
+                    "*Error*: The right side bound is less than the left-side bound. This is inappropriate."
+                )
         else:
             if args.Offset[0] == 0:
-                sys.exit("*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment.")
+                sys.exit(
+                    "*Error*: An offset of 0 isn't allowed, since offsets are 1-based positions inside each alignment."
+ ) + wr = OffsetFragment( + [args.bam], + binLength=args.binSize, + stepSize=args.binSize, + region=args.region, + numberOfProcessors=args.numberOfProcessors, + extendReads=args.extendReads, + minMappingQuality=args.minMappingQuality, + ignoreDuplicates=args.ignoreDuplicates, + center_read=args.centerReads, + zerosToNans=args.skipNonCoveredRegions, + samFlag_include=args.samFlagInclude, + samFlag_exclude=args.samFlagExclude, + minFragmentLength=args.minFragmentLength, + maxFragmentLength=args.maxFragmentLength, + chrsToSkip=args.ignoreForNormalization, + verbose=args.verbose, + ) wr.filter_strand = args.filterRNAstrand wr.Offset = args.Offset else: - wr = writeBedGraph.WriteBedGraph([args.bam], - binLength=args.binSize, - stepSize=args.binSize, - region=args.region, - blackListFileName=args.blackListFileName, - numberOfProcessors=args.numberOfProcessors, - extendReads=args.extendReads, - minMappingQuality=args.minMappingQuality, - ignoreDuplicates=args.ignoreDuplicates, - center_read=args.centerReads, - zerosToNans=args.skipNonCoveredRegions, - samFlag_include=args.samFlagInclude, - samFlag_exclude=args.samFlagExclude, - minFragmentLength=args.minFragmentLength, - maxFragmentLength=args.maxFragmentLength, - chrsToSkip=args.ignoreForNormalization, - verbose=args.verbose, - ) - - wr.run(writeBedGraph.scaleCoverage, func_args, args.outFileName, - blackListFileName=args.blackListFileName, - format=args.outFileFormat, smoothLength=args.smoothLength) + wr = writeBedGraph.WriteBedGraph( + [args.bam], + binLength=args.binSize, + stepSize=args.binSize, + region=args.region, + blackListFileName=args.blackListFileName, + numberOfProcessors=args.numberOfProcessors, + extendReads=args.extendReads, + minMappingQuality=args.minMappingQuality, + ignoreDuplicates=args.ignoreDuplicates, + center_read=args.centerReads, + zerosToNans=args.skipNonCoveredRegions, + samFlag_include=args.samFlagInclude, + samFlag_exclude=args.samFlagExclude, + minFragmentLength=args.minFragmentLength, + maxFragmentLength=args.maxFragmentLength, + chrsToSkip=args.ignoreForNormalization, + verbose=args.verbose, + ) + + wr.run( + writeBedGraph.scaleCoverage, + func_args, + args.outFileName, + blackListFileName=args.blackListFileName, + format=args.outFileFormat, + smoothLength=args.smoothLength, + ) class OffsetFragment(writeBedGraph.WriteBedGraph): """ Class to redefine the get_fragment_from_read for the --Offset case """ + def filterStrand(self, read, rv): """ A generic read filtering function that gets used by everything in this class. 
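# --- editor's annotation (illustrative sketch, not part of the diff) ---
# Companion to the single-end branch of filterStrand below: with a dUTP-based
# library, a lone reverse-strand read (flag bit 16) derives from a
# forward-strand transcript, hence 'forward' keeps flag & 16 == 16.
def single_end_keeps(flag, strand):
    if strand == "forward":
        return flag & 16 == 16
    if strand == "reverse":
        return flag & 16 == 0
    return True  # no strand filter requested

print(single_end_keeps(16, "forward"))  # True
print(single_end_keeps(0, "forward"))   # False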
@@ -270,19 +310,19 @@ def filterStrand(self, read, rv): """ # Filter by RNA strand, if desired if read.is_paired: - if self.filter_strand == 'forward': + if self.filter_strand == "forward": if read.flag & 144 == 128 or read.flag & 96 == 64: return rv - elif self.filter_strand == 'reverse': + elif self.filter_strand == "reverse": if read.flag & 144 == 144 or read.flag & 96 == 96: return rv else: return rv else: - if self.filter_strand == 'forward': + if self.filter_strand == "forward": if read.flag & 16 == 16: return rv - elif self.filter_strand == 'reverse': + elif self.filter_strand == "reverse": if read.flag & 16 == 0: return rv else: @@ -298,27 +338,42 @@ def get_fragment_from_read_list(self, read, offset): blocks = read.get_blocks() blockLen = sum([x[1] - x[0] for x in blocks]) - if self.defaultFragmentLength != 'read length': + if self.defaultFragmentLength != "read length": if self.is_proper_pair(read, self.maxPairedFragmentLength): if read.is_reverse: foo = (read.next_reference_start, read.reference_start) if foo[0] < foo[1]: blocks.insert(0, foo) else: - foo = (read.reference_end, read.reference_end + abs(read.template_length) - read.infer_query_length()) + foo = ( + read.reference_end, + read.reference_end + + abs(read.template_length) + - read.infer_query_length(), + ) if foo[0] < foo[1]: blocks.append(foo) # Extend using the default fragment length else: if read.is_reverse: - foo = (read.reference_start - self.defaultFragmentLength + read.infer_query_length(), read.reference_start) + foo = ( + read.reference_start + - self.defaultFragmentLength + + read.infer_query_length(), + read.reference_start, + ) if foo[0] < 0: foo = (0, foo[1]) if foo[0] < foo[1]: blocks.insert(0, foo) else: - foo = (read.reference_end, read.reference_end + self.defaultFragmentLength - read.infer_query_length()) + foo = ( + read.reference_end, + read.reference_end + + self.defaultFragmentLength + - read.infer_query_length(), + ) if foo[0] < foo[1]: blocks.append(foo) @@ -333,11 +388,11 @@ def get_fragment_from_read_list(self, read, offset): # Handle --centerReads if self.center_read: _ = (len(stretch) - blockLen) // 2 - stretch = stretch[_:_ + blockLen] + stretch = stretch[_ : _ + blockLen] # Subset by --Offset try: - foo = stretch[offset[0]:offset[1]] + foo = stretch[offset[0] : offset[1]] except: return rv @@ -349,7 +404,9 @@ def get_fragment_from_read_list(self, read, offset): # Convert the stretch back to a list of tuples foo = np.array(foo) d = foo[1:] - foo[:-1] - idx = np.argwhere(d > 1).flatten().tolist() # This now holds the interval bounds as a list + idx = ( + np.argwhere(d > 1).flatten().tolist() + ) # This now holds the interval bounds as a list idx.append(-1) last = 0 rv = [] @@ -393,6 +450,7 @@ class CenterFragment(writeBedGraph.WriteBedGraph): The coverage of the fragment is defined as the 2 or 3 basepairs at the center of the fragment length. 
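# --- editor's annotation (illustrative sketch, not part of the diff) ---
# The np.argwhere(d > 1) idiom reformatted above turns a sorted run of covered
# positions back into (start, end) block tuples. A self-contained
# reconstruction of that idea (the surrounding bookkeeping is simplified):
import numpy as np

def positions_to_blocks(positions):
    foo = np.array(positions)
    d = foo[1:] - foo[:-1]
    idx = np.argwhere(d > 1).flatten().tolist()  # indices ending each block
    idx.append(-1)
    blocks, last = [], 0
    for i in idx:
        blocks.append((int(foo[last]), int(foo[i]) + 1))
        last = i + 1
    return blocks

print(positions_to_blocks([10, 11, 12, 20, 21]))  # -> [(10, 13), (20, 22)]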
""" + def get_fragment_from_read(self, read): """ Takes a proper pair fragment of high quality and limited diff --git a/deeptools/bamHandler.py b/deeptools/bamHandler.py index cb8b424ff..be37bcafb 100644 --- a/deeptools/bamHandler.py +++ b/deeptools/bamHandler.py @@ -29,7 +29,9 @@ def getMappingStats(bam, nThreads): This requires pysam > 0.13.0 """ header = [(x, y) for x, y in zip(bam.references, bam.lengths)] - res = mapReduce([bam.filename, False], countReadsInInterval, header, numberOfProcessors=nThreads) + res = mapReduce( + [bam.filename, False], countReadsInInterval, header, numberOfProcessors=nThreads + ) mapped = sum([x[0] for x in res]) unmapped = sum([x[1] for x in res]) @@ -70,16 +72,20 @@ def openBam(bamFile, returnStats=False, nThreads=1, minimalDecoding=True): if not minimalDecoding: format_options = None try: - bam = pysam.Samfile(bamFile, 'rb', format_options=format_options) + bam = pysam.Samfile(bamFile, "rb", format_options=format_options) except IOError: sys.exit("The file '{}' does not exist".format(bamFile)) except: sys.exit("The file '{}' does not have BAM or CRAM format ".format(bamFile)) try: - assert(bam.check_index() is not False) + assert bam.check_index() is not False except: - sys.exit("'{}' does not appear to have an index. You MUST index the file first!".format(bamFile)) + sys.exit( + "'{}' does not appear to have an index. You MUST index the file first!".format( + bamFile + ) + ) if bam.is_cram and returnStats: mapped, unmapped, stats = getMappingStats(bam, nThreads) @@ -89,13 +95,18 @@ def openBam(bamFile, returnStats=False, nThreads=1, minimalDecoding=True): # Make the dictionary to hold the stats if returnStats: - stats = {chrom.contig: [chrom.mapped, chrom.unmapped] for chrom in bam.get_index_statistics()} + stats = { + chrom.contig: [chrom.mapped, chrom.unmapped] + for chrom in bam.get_index_statistics() + } if bam.is_bam or (bam.is_cram and returnStats): if mapped == 0: - sys.stderr.write("WARNING! '{}' does not have any mapped reads. Please " - "check that the file is properly indexed and " - "that it contains mapped reads.\n".format(bamFile)) + sys.stderr.write( + "WARNING! '{}' does not have any mapped reads. Please " + "check that the file is properly indexed and " + "that it contains mapped reads.\n".format(bamFile) + ) if returnStats: return bam, mapped, unmapped, stats diff --git a/deeptools/bamPEFragmentSize.py b/deeptools/bamPEFragmentSize.py index 646b51cae..341b76c99 100755 --- a/deeptools/bamPEFragmentSize.py +++ b/deeptools/bamPEFragmentSize.py @@ -6,9 +6,10 @@ import numpy as np import matplotlib -matplotlib.use('Agg') -matplotlib.rcParams['pdf.fonttype'] = 42 -matplotlib.rcParams['svg.fonttype'] = 'none' + +matplotlib.use("Agg") +matplotlib.rcParams["pdf.fonttype"] = 42 +matplotlib.rcParams["svg.fonttype"] = "none" from deeptools import cm # noqa: F401 import matplotlib.pyplot as plt @@ -23,93 +24,127 @@ def parse_arguments(): parser = argparse.ArgumentParser( - description='This tool calculates the fragment sizes for read pairs given a BAM file from paired-end sequencing.' - 'Several regions are sampled depending on the ' - 'size of the genome and number of processors to estimate the' - 'summary statistics on the fragment lengths. ' - 'Properly paired reads are preferred for computation, i.e., ' - 'it will only use discordant pairs if no concordant alignments ' - 'overlap with a given region. 
' - 'The default setting simply prints the summary statistics to the screen.') - parser.add_argument('--bamfiles', '-b', - help='List of BAM files to process', - nargs='+', - metavar='bam files') - - parser.add_argument('--histogram', '-hist', '-o', - help='Save a .png file with a histogram ' - 'of the fragment length distribution.', - metavar='FILE') - - parser.add_argument('--plotFileFormat', - metavar='FILETYPE', - help='Image format type. If given, this option ' - 'overrides the image format based on the plotFile ' - 'ending. The available options are: png, ' - 'eps, pdf, svg and plotly.', - default=None, - choices=['png', 'pdf', 'svg', 'eps', 'plotly']) - - parser.add_argument('--numberOfProcessors', '-p', - help='Number of processors to use. The default is ' - 'to use 1. (Default: %(default)s)', - metavar="INT", - type=int, - default=1, - required=False) - parser.add_argument('--samplesLabel', - help='Labels for the samples plotted. The ' - 'default is to use the file name of the ' - 'sample. The sample labels should be separated ' - 'by spaces and quoted if a label itself' - 'contains a space E.g. --samplesLabel label-1 "label 2" ', - nargs='+') - parser.add_argument('--plotTitle', '-T', - help='Title of the plot, to be printed on top of ' - 'the generated image. Leave blank for no title. (Default: %(default)s)', - default='') - parser.add_argument('--maxFragmentLength', - help='The maximum fragment length in the histogram. A value of 0 (the default) indicates to use twice the mean fragment length. (Default: %(default)s)', - default=0, - type=int) - parser.add_argument('--logScale', - help='Plot on the log scale', - action='store_true') - parser.add_argument('--binSize', '-bs', - metavar='INT', - help='Length in bases of the window used to sample the genome. (Default: %(default)s)', - default=1000, - type=int) - parser.add_argument('--distanceBetweenBins', '-n', - metavar='INT', - help='To reduce the computation time, not every possible genomic ' - 'bin is sampled. This option allows you to set the distance ' - 'between bins actually sampled from. Larger numbers are sufficient ' - 'for high coverage samples, while smaller values are useful for ' - 'lower coverage samples. Note that if you specify a value that ' - 'results in too few (<1000) reads sampled, the value will be ' - 'decreased. (Default: %(default)s)', - default=1000000, - type=int) - parser.add_argument('--blackListFileName', '-bl', - help="A BED file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered.", - metavar="BED file", - required=False) - parser.add_argument('--table', - metavar='FILE', - help='In addition to printing read and fragment length metrics to the screen, write them to the given file in tabular format.', - required=False) - parser.add_argument('--outRawFragmentLengths', - metavar='FILE', - required=False, - type=writableFile, - help='Save the fragment (or read if the input is single-end) length and their associated number of occurrences to a tab-separated file. 
Columns are length, number of occurrences, and the sample label.') - parser.add_argument('--verbose', - help='Set if processing data messages are wanted.', - action='store_true', - required=False) - parser.add_argument('--version', action='version', - version='%(prog)s {}'.format(__version__)) + description="This tool calculates the fragment sizes for read pairs given a BAM file from paired-end sequencing. " + "Several regions are sampled depending on the " + "size of the genome and number of processors to estimate the " + "summary statistics on the fragment lengths. " + "Properly paired reads are preferred for computation, i.e., " + "it will only use discordant pairs if no concordant alignments " + "overlap with a given region. " + "The default setting simply prints the summary statistics to the screen." + ) + parser.add_argument( + "--bamfiles", + "-b", + help="List of BAM files to process", + nargs="+", + metavar="bam files", + ) + + parser.add_argument( + "--histogram", + "-hist", + "-o", + help="Save a .png file with a histogram " + "of the fragment length distribution.", + metavar="FILE", + ) + + parser.add_argument( + "--plotFileFormat", + metavar="FILETYPE", + help="Image format type. If given, this option " + "overrides the image format based on the plotFile " + "ending. The available options are: png, " + "eps, pdf, svg and plotly.", + default=None, + choices=["png", "pdf", "svg", "eps", "plotly"], + ) + + parser.add_argument( + "--numberOfProcessors", + "-p", + help="Number of processors to use. The default is " + "to use 1. (Default: %(default)s)", + metavar="INT", + type=int, + default=1, + required=False, + ) + parser.add_argument( + "--samplesLabel", + help="Labels for the samples plotted. The " + "default is to use the file name of the " + "sample. The sample labels should be separated " + "by spaces and quoted if a label itself " + 'contains a space E.g. --samplesLabel label-1 "label 2" ', + nargs="+", + ) + parser.add_argument( + "--plotTitle", + "-T", + help="Title of the plot, to be printed on top of " + "the generated image. Leave blank for no title. (Default: %(default)s)", + default="", + ) + parser.add_argument( + "--maxFragmentLength", + help="The maximum fragment length in the histogram. A value of 0 (the default) indicates to use twice the mean fragment length. (Default: %(default)s)", + default=0, + type=int, + ) + parser.add_argument("--logScale", help="Plot on the log scale", action="store_true") + parser.add_argument( + "--binSize", + "-bs", + metavar="INT", + help="Length in bases of the window used to sample the genome. (Default: %(default)s)", + default=1000, + type=int, + ) + parser.add_argument( + "--distanceBetweenBins", + "-n", + metavar="INT", + help="To reduce the computation time, not every possible genomic " + "bin is sampled. This option allows you to set the distance " + "between bins actually sampled from. Larger numbers are sufficient " + "for high coverage samples, while smaller values are useful for " + "lower coverage samples. Note that if you specify a value that " + "results in too few (<1000) reads sampled, the value will be " + "decreased. (Default: %(default)s)", + default=1000000, + type=int, + ) + parser.add_argument( + "--blackListFileName", + "-bl", + help="A BED file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. 
Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered.", + metavar="BED file", + required=False, + ) + parser.add_argument( + "--table", + metavar="FILE", + help="In addition to printing read and fragment length metrics to the screen, write them to the given file in tabular format.", + required=False, + ) + parser.add_argument( + "--outRawFragmentLengths", + metavar="FILE", + required=False, + type=writableFile, + help="Save the fragment (or read if the input is single-end) length and their associated number of occurrences to a tab-separated file. Columns are length, number of occurrences, and the sample label.", + ) + parser.add_argument( + "--verbose", + help="Set if processing data messages are wanted.", + action="store_true", + required=False, + ) + parser.add_argument( + "--version", action="version", version="%(prog)s {}".format(__version__) + ) return parser @@ -125,23 +160,33 @@ def getDensity(lengths, minVal, maxVal): def getFragSize(bam, args, idx, outRawFrags): - fragment_len_dict, read_len_dict = get_read_and_fragment_length(bam, return_lengths=True, - blackListFileName=args.blackListFileName, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose, - binSize=args.binSize, - distanceBetweenBins=args.distanceBetweenBins) + fragment_len_dict, read_len_dict = get_read_and_fragment_length( + bam, + return_lengths=True, + blackListFileName=args.blackListFileName, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + binSize=args.binSize, + distanceBetweenBins=args.distanceBetweenBins, + ) if outRawFrags: label = bam if args.samplesLabel and idx < len(args.samplesLabel): label = args.samplesLabel[idx] if fragment_len_dict: - fragment_len_dict['lengths'] = [int(x) for x in fragment_len_dict['lengths']] - cnts = np.bincount(fragment_len_dict['lengths'], minlength=int(fragment_len_dict['max']) + 1) + fragment_len_dict["lengths"] = [ + int(x) for x in fragment_len_dict["lengths"] + ] + cnts = np.bincount( + fragment_len_dict["lengths"], + minlength=int(fragment_len_dict["max"]) + 1, + ) else: - read_len_dict['lengths'] = [int(x) for x in read_len_dict['lengths']] - cnts = np.bincount(read_len_dict['lengths'], minlength=int(read_len_dict['max']) + 1) + read_len_dict["lengths"] = [int(x) for x in read_len_dict["lengths"]] + cnts = np.bincount( + read_len_dict["lengths"], minlength=int(read_len_dict["max"]) + 1 + ) for idx, v in enumerate(cnts): if v > 0: outRawFrags.write("{}\t{}\t{}\n".format(idx, v, label)) @@ -152,74 +197,98 @@ def getFragSize(bam, args, idx, outRawFrags): print("\n\nBAM file : {}".format(bam)) if fragment_len_dict: - if fragment_len_dict['mean'] == 0: - print("No pairs were found. Is the data from a paired-end sequencing experiment?") + if fragment_len_dict["mean"] == 0: + print( + "No pairs were found. Is the data from a paired-end sequencing experiment?" + ) - print("Sample size: {}\n".format(fragment_len_dict['sample_size'])) + print("Sample size: {}\n".format(fragment_len_dict["sample_size"])) print("Fragment lengths:") - print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n" - "3rd Qu.: {}\nMax.: {}\nStd: {}".format(fragment_len_dict['min'], - fragment_len_dict['qtile25'], - fragment_len_dict['mean'], - fragment_len_dict['median'], - fragment_len_dict['qtile75'], - fragment_len_dict['max'], - fragment_len_dict['std'])) - print("MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\nLen. 60%: {}\nLen. 70%: {}\nLen. 
80%: {}\nLen. 90%: {}\nLen. 99%: {}\n".format(fragment_len_dict['mad'], - fragment_len_dict['qtile10'], - fragment_len_dict['qtile20'], - fragment_len_dict['qtile30'], - fragment_len_dict['qtile40'], - fragment_len_dict['qtile60'], - fragment_len_dict['qtile70'], - fragment_len_dict['qtile80'], - fragment_len_dict['qtile90'], - fragment_len_dict['qtile99'])) + print( + "Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n" + "3rd Qu.: {}\nMax.: {}\nStd: {}".format( + fragment_len_dict["min"], + fragment_len_dict["qtile25"], + fragment_len_dict["mean"], + fragment_len_dict["median"], + fragment_len_dict["qtile75"], + fragment_len_dict["max"], + fragment_len_dict["std"], + ) + ) + print( + "MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\nLen. 60%: {}\nLen. 70%: {}\nLen. 80%: {}\nLen. 90%: {}\nLen. 99%: {}\n".format( + fragment_len_dict["mad"], + fragment_len_dict["qtile10"], + fragment_len_dict["qtile20"], + fragment_len_dict["qtile30"], + fragment_len_dict["qtile40"], + fragment_len_dict["qtile60"], + fragment_len_dict["qtile70"], + fragment_len_dict["qtile80"], + fragment_len_dict["qtile90"], + fragment_len_dict["qtile99"], + ) + ) else: - print("No pairs were found. Is the data from a paired-end sequencing experiment?") + print( + "No pairs were found. Is the data from a paired-end sequencing experiment?" + ) print("\nRead lengths:") - print("Sample size: {}\n".format(read_len_dict['sample_size'])) - print("Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n" - "3rd Qu.: {}\nMax.: {}\nStd: {}".format(read_len_dict['min'], - read_len_dict['qtile25'], - read_len_dict['mean'], - read_len_dict['median'], - read_len_dict['qtile75'], - read_len_dict['max'], - read_len_dict['std'])) - print("MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\nLen. 60%: {}\nLen. 70%: {}\nLen. 80%: {}\nLen. 90%: {}\nLen. 99%: {}\n".format(read_len_dict['mad'], - read_len_dict['qtile10'], - read_len_dict['qtile20'], - read_len_dict['qtile30'], - read_len_dict['qtile40'], - read_len_dict['qtile60'], - read_len_dict['qtile70'], - read_len_dict['qtile80'], - read_len_dict['qtile90'], - read_len_dict['qtile99'])) + print("Sample size: {}\n".format(read_len_dict["sample_size"])) + print( + "Min.: {}\n1st Qu.: {}\nMean: {}\nMedian: {}\n" + "3rd Qu.: {}\nMax.: {}\nStd: {}".format( + read_len_dict["min"], + read_len_dict["qtile25"], + read_len_dict["mean"], + read_len_dict["median"], + read_len_dict["qtile75"], + read_len_dict["max"], + read_len_dict["std"], + ) + ) + print( + "MAD: {}\nLen. 10%: {}\nLen. 20%: {}\nLen. 30%: {}\nLen. 40%: {}\nLen. 60%: {}\nLen. 70%: {}\nLen. 80%: {}\nLen. 90%: {}\nLen. 99%: {}\n".format( + read_len_dict["mad"], + read_len_dict["qtile10"], + read_len_dict["qtile20"], + read_len_dict["qtile30"], + read_len_dict["qtile40"], + read_len_dict["qtile60"], + read_len_dict["qtile70"], + read_len_dict["qtile80"], + read_len_dict["qtile90"], + read_len_dict["qtile99"], + ) + ) # The read and fragment lists will just eat up memory if not removed! 
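# --- Illustrative aside, not part of this patch: a minimal sketch of how the
# --- per-length table written by --outRawFragmentLengths above is assembled.
# --- np.bincount turns a list of fragment lengths into counts indexed by
# --- length, and only the non-zero rows are emitted. All values are invented.
import numpy as np

lengths = [180, 180, 250, 250, 250, 310]                 # hypothetical fragment lengths
cnts = np.bincount(lengths, minlength=max(lengths) + 1)  # index i holds count of length i
for length, n in enumerate(cnts):
    if n > 0:
        print("{}\t{}\t{}".format(length, n, "sample_1"))  # length, occurrences, label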
if args.histogram: if fragment_len_dict: - maxVal = fragment_len_dict['mean'] * 2 - minVal = fragment_len_dict['min'] + maxVal = fragment_len_dict["mean"] * 2 + minVal = fragment_len_dict["min"] else: - maxVal = read_len_dict['mean'] * 2 - minVal = read_len_dict['min'] + maxVal = read_len_dict["mean"] * 2 + minVal = read_len_dict["min"] if args.maxFragmentLength > 0: maxVal = args.maxFragmentLength if fragment_len_dict: - fragment_len_dict['lengths'] = getDensity(fragment_len_dict['lengths'], minVal, maxVal) + fragment_len_dict["lengths"] = getDensity( + fragment_len_dict["lengths"], minVal, maxVal + ) if read_len_dict: - read_len_dict['lengths'] = getDensity(read_len_dict['lengths'], minVal, maxVal) + read_len_dict["lengths"] = getDensity( + read_len_dict["lengths"], minVal, maxVal + ) else: if fragment_len_dict: - del fragment_len_dict['lengths'] + del fragment_len_dict["lengths"] if read_len_dict: - del read_len_dict['lengths'] + del read_len_dict["lengths"] return (fragment_len_dict, read_len_dict) @@ -230,11 +299,19 @@ def printTable(args, fragDict, readDict): """ of = open(args.table, "w") of.write("\tFrag. Sampled") - of.write("\tFrag. Len. Min.\tFrag. Len. 1st. Qu.\tFrag. Len. Mean\tFrag. Len. Median\tFrag. Len. 3rd Qu.\tFrag. Len. Max\tFrag. Len. Std.") - of.write("\tFrag. Med. Abs. Dev.\tFrag. Len. 10%\tFrag. Len. 20%\tFrag. Len. 30%\tFrag. Len. 40%\tFrag. Len. 60%\tFrag. Len. 70%\tFrag. Len. 80%\tFrag. Len. 90%\tFrag. Len. 99%") + of.write( + "\tFrag. Len. Min.\tFrag. Len. 1st. Qu.\tFrag. Len. Mean\tFrag. Len. Median\tFrag. Len. 3rd Qu.\tFrag. Len. Max\tFrag. Len. Std." + ) + of.write( + "\tFrag. Med. Abs. Dev.\tFrag. Len. 10%\tFrag. Len. 20%\tFrag. Len. 30%\tFrag. Len. 40%\tFrag. Len. 60%\tFrag. Len. 70%\tFrag. Len. 80%\tFrag. Len. 90%\tFrag. Len. 99%" + ) of.write("\tReads Sampled") - of.write("\tRead Len. Min.\tRead Len. 1st. Qu.\tRead Len. Mean\tRead Len. Median\tRead Len. 3rd Qu.\tRead Len. Max\tRead Len. Std.") - of.write("\tRead Med. Abs. Dev.\tRead Len. 10%\tRead Len. 20%\tRead Len. 30%\tRead Len. 40%\tRead Len. 60%\tRead Len. 70%\tRead Len. 80%\tRead Len. 90%\tRead Len. 99%\n") + of.write( + "\tRead Len. Min.\tRead Len. 1st. Qu.\tRead Len. Mean\tRead Len. Median\tRead Len. 3rd Qu.\tRead Len. Max\tRead Len. Std." + ) + of.write( + "\tRead Med. Abs. Dev.\tRead Len. 10%\tRead Len. 20%\tRead Len. 30%\tRead Len. 40%\tRead Len. 60%\tRead Len. 70%\tRead Len. 80%\tRead Len. 90%\tRead Len. 
99%\n" + ) for idx, bam in enumerate(args.bamfiles): if args.samplesLabel and idx < len(args.samplesLabel): @@ -243,47 +320,63 @@ def printTable(args, fragDict, readDict): of.write(bam) if fragDict is not None and fragDict[bam] is not None: d = fragDict[bam] - of.write("\t{}".format(d['sample_size'])) - of.write("\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(d['min'], - d['qtile25'], - d['mean'], - d['median'], - d['qtile75'], - d['max'], - d['std'])) - of.write("\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(d['mad'], - d['qtile10'], - d['qtile20'], - d['qtile30'], - d['qtile40'], - d['qtile60'], - d['qtile70'], - d['qtile80'], - d['qtile90'], - d['qtile99'])) + of.write("\t{}".format(d["sample_size"])) + of.write( + "\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format( + d["min"], + d["qtile25"], + d["mean"], + d["median"], + d["qtile75"], + d["max"], + d["std"], + ) + ) + of.write( + "\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format( + d["mad"], + d["qtile10"], + d["qtile20"], + d["qtile30"], + d["qtile40"], + d["qtile60"], + d["qtile70"], + d["qtile80"], + d["qtile90"], + d["qtile99"], + ) + ) else: of.write("\t0") of.write("\t0\t0\t0\t0\t0\t0\t0") of.write("\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0") d = readDict[bam] - of.write("\t{}".format(d['sample_size'])) - of.write("\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(d['min'], - d['qtile25'], - d['mean'], - d['median'], - d['qtile75'], - d['max'], - d['std'])) - of.write("\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(d['mad'], - d['qtile10'], - d['qtile20'], - d['qtile30'], - d['qtile40'], - d['qtile60'], - d['qtile70'], - d['qtile80'], - d['qtile90'], - d['qtile99'])) + of.write("\t{}".format(d["sample_size"])) + of.write( + "\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format( + d["min"], + d["qtile25"], + d["mean"], + d["median"], + d["qtile75"], + d["max"], + d["std"], + ) + ) + of.write( + "\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( + d["mad"], + d["qtile10"], + d["qtile20"], + d["qtile30"], + d["qtile40"], + d["qtile60"], + d["qtile70"], + d["qtile80"], + d["qtile90"], + d["qtile99"], + ) + ) of.close() @@ -322,37 +415,44 @@ def main(args=None): if args.maxFragmentLength > 0: maxVal = args.maxFragmentLength else: - maxVal = d['mean'] * 2 - - if args.plotFileFormat == 'plotly': - trace = go.Histogram(x=d['lengths'], - histnorm='probability', - opacity=0.5, - name=labels[i], - nbinsx=100, - xbins=dict(start=d['min'], end=maxVal)) + maxVal = d["mean"] * 2 + + if args.plotFileFormat == "plotly": + trace = go.Histogram( + x=d["lengths"], + histnorm="probability", + opacity=0.5, + name=labels[i], + nbinsx=100, + xbins=dict(start=d["min"], end=maxVal), + ) data.append(trace) else: - plt.bar(d['lengths'][1][:-1], height=d['lengths'][0], - width=d['lengths'][1][1:] - d['lengths'][1][:-1], - align='edge', log=args.logScale, - alpha=0.5, label=labels[i]) + plt.bar( + d["lengths"][1][:-1], + height=d["lengths"][0], + width=d["lengths"][1][1:] - d["lengths"][1][:-1], + align="edge", + log=args.logScale, + alpha=0.5, + label=labels[i], + ) i += 1 - if args.plotFileFormat == 'plotly': + if args.plotFileFormat == "plotly": fig = go.Figure() fig.add_traces(data) - fig['layout']['yaxis1'].update(title='Frequency') - fig['layout']['xaxis1'].update(title='Fragment Length') - fig['layout'].update(title=args.plotTitle) - fig['layout'].update(showlegend=True) + fig["layout"]["yaxis1"].update(title="Frequency") + fig["layout"]["xaxis1"].update(title="Fragment Length") + fig["layout"].update(title=args.plotTitle) + fig["layout"].update(showlegend=True) if args.logScale: - 
fig['layout']['yaxis1'].update(type='log') + fig["layout"]["yaxis1"].update(type="log") py.plot(fig, filename=args.histogram, auto_open=False) else: - plt.xlabel('Fragment Length') - plt.ylabel('Frequency') - plt.legend(loc='upper right') + plt.xlabel("Fragment Length") + plt.ylabel("Frequency") + plt.legend(loc="upper right") plt.title(args.plotTitle) plt.savefig(args.histogram, bbox_inches=0, format=args.plotFileFormat) plt.close() diff --git a/deeptools/bigwigCompare.py b/deeptools/bigwigCompare.py index e4d69da20..9463a7dba 100644 --- a/deeptools/bigwigCompare.py +++ b/deeptools/bigwigCompare.py @@ -19,79 +19,103 @@ def parse_arguments(args=None): parser = argparse.ArgumentParser( parents=[parentParser, outputParser, dbParser], formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description='This tool compares two bigWig files based on the number ' - 'of mapped reads. To compare the bigWig files, the genome is ' - 'partitioned into bins of equal size, then the number of reads found ' - 'in each BAM file are counted per bin and finally a summary ' - 'value is reported. This value can be the ratio of the number of reads' - 'per bin, the log2 of the ratio, the sum or the difference.') + description="This tool compares two bigWig files based on the number " + "of mapped reads. To compare the bigWig files, the genome is " + "partitioned into bins of equal size, then the number of reads found " + "in each bigWig file is counted per bin and finally a summary " + "value is reported. This value can be the ratio of the number of reads " + "per bin, the log2 of the ratio, the sum or the difference.", + ) # define the arguments - parser.add_argument('--bigwig1', '-b1', - metavar='Bigwig file', - help='Bigwig file 1. Usually the file for the ' - 'treatment.', - required=True) - - parser.add_argument('--bigwig2', '-b2', - metavar='Bigwig file', - help='Bigwig file 2. Usually the file for the ' - 'control.', - required=True) - - parser.add_argument('--scaleFactors', - help='Set this parameter to multipy the bigwig values ' - 'by a constant. The format is ' - 'scaleFactor1:scaleFactor2. ' - 'For example 0.7:1 to scale the first bigwig file ' - 'by 0.7 while not scaling the second bigwig file', - default=None, - required=False) - - parser.add_argument('--pseudocount', - help='A small number to avoid x/0. Only useful ' - 'together with --operation log2 or --operation ratio. ' - 'You can specify different values as pseudocounts for ' - 'the numerator and the denominator by providing two ' - 'values (the first value is used as the numerator ' - 'pseudocount and the second the denominator pseudocount). (Default: %(default)s)', - default=1, - nargs='+', - action=parserCommon.requiredLength(1, 2), - type=float, - required=False) - - parser.add_argument('--skipZeroOverZero', - help='Skip bins where BOTH BAM files lack coverage. ' - 'This is determined BEFORE any applicable pseudocount ' - 'is added.', - action='store_true') - - parser.add_argument('--operation', - help='The default is to output the log2ratio of the ' - 'two samples. The reciprocal ratio returns the ' - 'the negative of the inverse of the ratio ' - 'if the ratio is less than 0. The resulting ' - 'values are interpreted as negative fold changes. 
' - 'Instead of performing a ' - 'computation using both files, the scaled signal can ' - 'alternatively be output for the first or second file using ' - 'the \'--operation first\' or \'--operation second\' (Default: %(default)s)', - default='log2', - choices=['log2', 'ratio', 'subtract', 'add', 'mean', - 'reciprocal_ratio', 'first', 'second'], - required=False) - - parser.add_argument('--skipNonCoveredRegions', '--skipNAs', - help='This parameter determines if non-covered regions (regions without a score) ' - 'in the bigWig files should be skipped. The default is to treat those ' - 'regions as having a value of zero. ' - 'The decision to skip non-covered regions ' - 'depends on the interpretation of the data. Non-covered regions ' - 'in a bigWig file may represent repetitive regions that should ' - 'be skipped. Alternatively, the interpretation of non-covered regions as ' - 'zeros may be wrong and this option should be used ', - action='store_true') + parser.add_argument( + "--bigwig1", + "-b1", + metavar="Bigwig file", + help="Bigwig file 1. Usually the file for the " "treatment.", + required=True, + ) + + parser.add_argument( + "--bigwig2", + "-b2", + metavar="Bigwig file", + help="Bigwig file 2. Usually the file for the " "control.", + required=True, + ) + + parser.add_argument( + "--scaleFactors", + help="Set this parameter to multiply the bigwig values " + "by a constant. The format is " + "scaleFactor1:scaleFactor2. " + "For example 0.7:1 to scale the first bigwig file " + "by 0.7 while not scaling the second bigwig file", + default=None, + required=False, + ) + + parser.add_argument( + "--pseudocount", + help="A small number to avoid x/0. Only useful " + "together with --operation log2 or --operation ratio. " + "You can specify different values as pseudocounts for " + "the numerator and the denominator by providing two " + "values (the first value is used as the numerator " + "pseudocount and the second the denominator pseudocount). (Default: %(default)s)", + default=1, + nargs="+", + action=parserCommon.requiredLength(1, 2), + type=float, + required=False, + ) + + parser.add_argument( + "--skipZeroOverZero", + help="Skip bins where BOTH BAM files lack coverage. " + "This is determined BEFORE any applicable pseudocount " + "is added.", + action="store_true", + ) + + parser.add_argument( + "--operation", + help="The default is to output the log2ratio of the " + "two samples. The reciprocal ratio returns " + "the negative of the inverse of the ratio " + "if the ratio is less than 1. The resulting " + "values are interpreted as negative fold changes. " + "Instead of performing a " + "computation using both files, the scaled signal can " + "alternatively be output for the first or second file using " + "the '--operation first' or '--operation second' (Default: %(default)s)", + default="log2", + choices=[ + "log2", + "ratio", + "subtract", + "add", + "mean", + "reciprocal_ratio", + "first", + "second", + ], + required=False, + ) + + parser.add_argument( + "--skipNonCoveredRegions", + "--skipNAs", + help="This parameter determines if non-covered regions (regions without a score) " + "in the bigWig files should be skipped. The default is to treat those " + "regions as having a value of zero. " + "The decision to skip non-covered regions " + "depends on the interpretation of the data. Non-covered regions " + "in a bigWig file may represent repetitive regions that should " + "be skipped. 
Alternatively, the interpretation of non-covered regions as " + "zeros may be wrong and this option should be used ", + action="store_true", + ) return parser @@ -126,9 +150,11 @@ def main(args=None): # the getRatio function is called and receives # the function_args per each tile that is considered FUNC = getRatio - function_args = {'valueType': args.operation, - 'scaleFactors': scaleFactors, - 'pseudocount': args.pseudocount} + function_args = { + "valueType": args.operation, + "scaleFactors": scaleFactors, + "pseudocount": args.pseudocount, + } # Preload deepBlue files, which need to then be deleted deepBlueFiles = [] @@ -136,8 +162,14 @@ def main(args=None): if db.isDeepBlue(fname): deepBlueFiles.append([fname, idx]) if len(deepBlueFiles) > 0: - sys.stderr.write("Preloading the following deepBlue files: {}\n".format(",".join([x[0] for x in deepBlueFiles]))) - foo = db.deepBlue(deepBlueFiles[0][0], url=args.deepBlueURL, userKey=args.userKey) + sys.stderr.write( + "Preloading the following deepBlue files: {}\n".format( + ",".join([x[0] for x in deepBlueFiles]) + ) + ) + foo = db.deepBlue( + deepBlueFiles[0][0], url=args.deepBlueURL, userKey=args.userKey + ) regs = db.makeChromTiles(foo) for x in deepBlueFiles: x.extend([args, regs]) @@ -148,7 +180,7 @@ def main(args=None): res = list(map(db.preloadWrapper, deepBlueFiles)) # substitute the file names with the temp files - for (ftuple, r) in zip(deepBlueFiles, res): + for ftuple, r in zip(deepBlueFiles, res): if ftuple[1] == 0: args.bigwig1 = r else: @@ -157,10 +189,13 @@ def main(args=None): del regs writeBedGraph_bam_and_bw.writeBedGraph( - [(args.bigwig1, getType(args.bigwig1)), - (args.bigwig2, getType(args.bigwig2))], - args.outFileName, 0, FUNC, - function_args, tileSize=args.binSize, region=args.region, + [(args.bigwig1, getType(args.bigwig1)), (args.bigwig2, getType(args.bigwig2))], + args.outFileName, + 0, + FUNC, + function_args, + tileSize=args.binSize, + region=args.region, blackListFileName=args.blackListFileName, verbose=args.verbose, numberOfProcessors=args.numberOfProcessors, @@ -168,7 +203,8 @@ def main(args=None): format=args.outFileFormat, smoothLength=False, missingDataAsZero=not args.skipNonCoveredRegions, - extendPairedEnds=False) + extendPairedEnds=False, + ) # Clean up temporary bigWig files, if applicable if not args.deepBlueKeepTemp: diff --git a/deeptools/cm.py b/deeptools/cm.py index fcb7c20ff..2a561b6a0 100644 --- a/deeptools/cm.py +++ b/deeptools/cm.py @@ -289,7 +289,7 @@ [0.97912374, 0.90313207, 0.83979337], [0.979891, 0.90894778, 0.84827858], [0.98067764, 0.91476465, 0.85676611], - [0.98137749, 0.92061729, 0.86536915] + [0.98137749, 0.92061729, 0.86536915], ] @@ -549,7 +549,7 @@ [0.84857662, 0.9498573, 0.8776059], [0.8564431, 0.95309792, 0.88414253], [0.86429066, 0.95635719, 0.89067759], - [0.87218969, 0.95960708, 0.89725384] + [0.87218969, 0.95960708, 0.89725384], ] @@ -809,7 +809,7 @@ [0.67000176, 0.23511902, 0.24650278], [0.66693423, 0.22859879, 0.24124404], [0.6638441, 0.22201742, 0.2359961], - [0.66080672, 0.21526712, 0.23069468] + [0.66080672, 0.21526712, 0.23069468], ] @@ -1069,7 +1069,7 @@ [0.9932672, 0.79848979, 0.63231691], [0.99535958, 0.80926704, 0.64687278], [0.99740544, 0.82008078, 0.66150571], - [0.9992197, 0.83100723, 0.6764127] + [0.9992197, 0.83100723, 0.6764127], ] @@ -1077,7 +1077,6 @@ _names = ["rocket", "mako", "vlag", "icefire"] for _lut, _name in zip(_luts, _names): - _cmap = colors.ListedColormap(_lut, _name) locals()[_name] = _cmap diff --git a/deeptools/computeGCBias.py 
b/deeptools/computeGCBias.py index 0e2309161..612542c29 100755 --- a/deeptools/computeGCBias.py +++ b/deeptools/computeGCBias.py @@ -17,7 +17,7 @@ from deeptools import bamHandler debug = 0 -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") def parse_arguments(args=None): @@ -26,14 +26,15 @@ parser = argparse.ArgumentParser( parents=[requiredArgs, parentParser], formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description='Computes the GC-bias using Benjamini\'s method ' - '[Benjamini & Speed (2012). Nucleic Acids Research, 40(10). doi: 10.1093/nar/gks001]. ' - 'The GC-bias is visualized and the resulting table can be used to' - 'correct the bias with `correctGCBias`.', - usage='\n computeGCBias ' - '-b file.bam --effectiveGenomeSize 2150570000 -g mm9.2bit -l 200 --GCbiasFrequenciesFile freq.txt [options]', - conflict_handler='resolve', - add_help=False) + description="Computes the GC-bias using Benjamini's method " + "[Benjamini & Speed (2012). Nucleic Acids Research, 40(10). doi: 10.1093/nar/gks001]. " + "The GC-bias is visualized and the resulting table can be used to " + "correct the bias with `correctGCBias`.", + usage="\n computeGCBias " + "-b file.bam --effectiveGenomeSize 2150570000 -g mm9.2bit -l 200 --GCbiasFrequenciesFile freq.txt [options]", + conflict_handler="resolve", + add_help=False, + ) return parser @@ -41,96 +42,117 @@ def getRequiredArgs(): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') - - required.add_argument('--bamfile', '-b', - metavar='bam file', - help='Sorted BAM file. ', - required=True) - - required.add_argument('--effectiveGenomeSize', - help='The effective genome size is the portion ' - 'of the genome that is mappable. Large fractions of ' - 'the genome are stretches of NNNN that should be ' - 'discarded. Also, if repetitive regions were not ' - 'included in the mapping of reads, the effective ' - 'genome size needs to be adjusted accordingly. ' - 'A table of values is available here: ' - 'http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html .', - default=None, - type=int, - required=True) - - required.add_argument('--genome', '-g', - help='Genome in two bit format. Most genomes can be ' - 'found here: http://hgdownload.cse.ucsc.edu/gbdb/ ' - 'Search for the .2bit ending. Otherwise, fasta ' - 'files can be converted to 2bit using the UCSC ' - 'programm called faToTwoBit available for different ' - 'plattforms at ' - 'http://hgdownload.cse.ucsc.edu/admin/exe/', - metavar='2bit FILE', - required=True) - - required.add_argument('--GCbiasFrequenciesFile', '-freq', '-o', - help='Path to save the file containing ' - 'the observed and expected read frequencies per %%GC-' - 'content. This file is needed to run the ' - 'correctGCBias tool. This is a text file.', - type=argparse.FileType('w'), - metavar='FILE', - required=True) + required = parser.add_argument_group("Required arguments") + + required.add_argument( + "--bamfile", "-b", metavar="bam file", help="Sorted BAM file. ", required=True + ) + + required.add_argument( + "--effectiveGenomeSize", + help="The effective genome size is the portion " + "of the genome that is mappable. Large fractions of " + "the genome are stretches of NNNN that should be " + "discarded. Also, if repetitive regions were not " + "included in the mapping of reads, the effective " + "genome size needs to be adjusted accordingly. 
" + "A table of values is available here: " + "http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html .", + default=None, + type=int, + required=True, + ) + + required.add_argument( + "--genome", + "-g", + help="Genome in two bit format. Most genomes can be " + "found here: http://hgdownload.cse.ucsc.edu/gbdb/ " + "Search for the .2bit ending. Otherwise, fasta " + "files can be converted to 2bit using the UCSC " + "program called faToTwoBit available for different " + "platforms at " + "http://hgdownload.cse.ucsc.edu/admin/exe/", + metavar="2bit FILE", + required=True, + ) + + required.add_argument( + "--GCbiasFrequenciesFile", + "-freq", + "-o", + help="Path to save the file containing " + "the observed and expected read frequencies per %%GC-" + "content. This file is needed to run the " + "correctGCBias tool. This is a text file.", + type=argparse.FileType("w"), + metavar="FILE", + required=True, + ) # define the optional arguments - optional = parser.add_argument_group('Optional arguments') - optional.add_argument('--fragmentLength', '-l', - help='Fragment length used for the sequencing. If ' - 'paired-end reads are used, the fragment length is ' - 'computed based from the bam file', - type=int) - - optional.add_argument("--help", "-h", action="help", - help="show this help message and exit") - - optional.add_argument('--sampleSize', - default=5e7, - help='Number of sampling points to be considered. (Default: %(default)s)', - type=int) - - optional.add_argument('--extraSampling', - help='BED file containing genomic regions for which ' - 'extra sampling is required because they are ' - 'underrepresented in the genome.', - type=argparse.FileType('r'), - metavar='BED file') - - plot = parser.add_argument_group('Diagnostic plot options') - - plot.add_argument('--biasPlot', - metavar='FILE NAME', - help='If given, a diagnostic image summarizing ' - 'the GC-bias will be saved.') - - plot.add_argument('--plotFileFormat', - metavar='', - help='image format type. If given, this ' - 'option overrides the ' - 'image format based on the plotFile ending. ' - 'The available options are: "png", ' - '"eps", "pdf", "plotly" and "svg"', - choices=['png', 'pdf', 'svg', 'eps', 'plotly']) - - plot.add_argument('--regionSize', - metavar='INT', - type=int, - default=300, - help='To plot the reads per %%GC over a region' - 'the size of the region is required. By default, ' - 'the bin size is set to 300 bases, which is close to the ' - 'standard fragment size for Illumina machines. However, ' - 'if the depth of sequencing is low, a larger bin size ' - 'will be required, otherwise many bins will not ' - 'overlap with any read (Default: %(default)s)') + optional = parser.add_argument_group("Optional arguments") + optional.add_argument( + "--fragmentLength", + "-l", + help="Fragment length used for the sequencing. If " + "paired-end reads are used, the fragment length is " + "computed from the bam file", + type=int, + ) + + optional.add_argument( + "--help", "-h", action="help", help="show this help message and exit" + ) + + optional.add_argument( + "--sampleSize", + default=5e7, + help="Number of sampling points to be considered. 
(Default: %(default)s)", + type=int, + ) + + optional.add_argument( + "--extraSampling", + help="BED file containing genomic regions for which " + "extra sampling is required because they are " + "underrepresented in the genome.", + type=argparse.FileType("r"), + metavar="BED file", + ) + + plot = parser.add_argument_group("Diagnostic plot options") + + plot.add_argument( + "--biasPlot", + metavar="FILE NAME", + help="If given, a diagnostic image summarizing " "the GC-bias will be saved.", + ) + + plot.add_argument( + "--plotFileFormat", + metavar="", + help="image format type. If given, this " + "option overrides the " + "image format based on the plotFile ending. " + 'The available options are: "png", ' + '"eps", "pdf", "plotly" and "svg"', + choices=["png", "pdf", "svg", "eps", "plotly"], + ) + + plot.add_argument( + "--regionSize", + metavar="INT", + type=int, + default=300, + help="To plot the reads per %%GC over a region" + "the size of the region is required. By default, " + "the bin size is set to 300 bases, which is close to the " + "standard fragment size for Illumina machines. However, " + "if the depth of sequencing is low, a larger bin size " + "will be required, otherwise many bins will not " + "overlap with any read (Default: %(default)s)", + ) return parser @@ -148,13 +170,13 @@ def getPositionsToSample(chrom, start, end, stepSize): """ positions_to_sample = np.arange(start, end, stepSize) - if global_vars['filter_out']: - filter_out_tree = GTF(global_vars['filter_out']) + if global_vars["filter_out"]: + filter_out_tree = GTF(global_vars["filter_out"]) else: filter_out_tree = None - if global_vars['extra_sampling_file']: - extra_tree = GTF(global_vars['extra_sampling_file']) + if global_vars["extra_sampling_file"]: + extra_tree = GTF(global_vars["extra_sampling_file"]) else: extra_tree = None @@ -167,14 +189,17 @@ def getPositionsToSample(chrom, start, end, stepSize): if len(extra_match) > 0: for intval in extra_match: - positions_to_sample = np.append(positions_to_sample, - list(range(intval[0], intval[1], stepSize))) + positions_to_sample = np.append( + positions_to_sample, list(range(intval[0], intval[1], stepSize)) + ) # remove duplicates positions_to_sample = np.unique(np.sort(positions_to_sample)) if debug: - print("sampling increased to {} from {}".format( - len(positions_to_sample), - orig_len)) + print( + "sampling increased to {} from {}".format( + len(positions_to_sample), orig_len + ) + ) # skip regions that are filtered out if filter_out_tree: @@ -185,8 +210,10 @@ def getPositionsToSample(chrom, start, end, stepSize): if len(out_match) > 0: for intval in out_match: - positions_to_sample = \ - positions_to_sample[(positions_to_sample < intval[0]) | (positions_to_sample >= intval[1])] + positions_to_sample = positions_to_sample[ + (positions_to_sample < intval[0]) + | (positions_to_sample >= intval[1]) + ] return positions_to_sample @@ -194,21 +221,20 @@ def countReadsPerGC_wrapper(args): return countReadsPerGC_worker(*args) -def countReadsPerGC_worker(chromNameBam, - start, end, stepSize, regionSize, - chrNameBamToBit, verbose=False): +def countReadsPerGC_worker( + chromNameBam, start, end, stepSize, regionSize, chrNameBamToBit, verbose=False +): """given a genome region defined by (start, end), the GC content is quantified for regions of size regionSize that are contiguous """ chromNameBit = chrNameBamToBit[chromNameBam] - tbit = py2bit.open(global_vars['2bit']) - bam = bamHandler.openBam(global_vars['bam']) + tbit = py2bit.open(global_vars["2bit"]) + bam = 
bamHandler.openBam(global_vars["bam"]) c = 1 sub_reads_per_gc = [] - positions_to_sample = getPositionsToSample(chromNameBit, - start, end, stepSize) + positions_to_sample = getPositionsToSample(chromNameBit, start, end, stepSize) for index in range(len(positions_to_sample)): i = positions_to_sample[index] @@ -234,10 +260,10 @@ def tabulateGCcontent_wrapper(args): return tabulateGCcontent_worker(*args) -def tabulateGCcontent_worker(chromNameBam, start, end, stepSize, - fragmentLength, - chrNameBamToBit, verbose=False): - r""" given genome regions, the GC content of the genome is tabulated for +def tabulateGCcontent_worker( + chromNameBam, start, end, stepSize, fragmentLength, chrNameBamToBit, verbose=False +): + r"""given genome regions, the GC content of the genome is tabulated for fragments of length 'fragmentLength' each 'stepSize' positions. >>> test = Tester() @@ -299,20 +325,20 @@ def tabulateGCcontent_worker(chromNameBam, start, end, stepSize, # indicate the gc content. The values inside the # array are counts. Thus, if N_gc[10] = 3, that means # that 3 regions have a gc_content of 10. - subN_gc = np.zeros(fragmentLength['median'] + 1, dtype='int') - subF_gc = np.zeros(fragmentLength['median'] + 1, dtype='int') + subN_gc = np.zeros(fragmentLength["median"] + 1, dtype="int") + subF_gc = np.zeros(fragmentLength["median"] + 1, dtype="int") - tbit = py2bit.open(global_vars['2bit']) - bam = bamHandler.openBam(global_vars['bam']) + tbit = py2bit.open(global_vars["2bit"]) + bam = bamHandler.openBam(global_vars["bam"]) peak = 0 startTime = time.time() if verbose: - print("[{:.3f}] computing positions to " - "sample".format(time.time() - startTime)) + print( + "[{:.3f}] computing positions to " "sample".format(time.time() - startTime) + ) - positions_to_sample = getPositionsToSample(chromNameBit, - start, end, stepSize) + positions_to_sample = getPositionsToSample(chromNameBit, start, end, stepSize) read_counts = [] # Optimize IO. 
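# --- Illustrative aside, not part of this patch: the IO optimization set up
# --- in the next hunk replaces one bam.fetch() per sampled position with a
# --- single fetch whose forward-read start positions are histogrammed once,
# --- so each later lookup is just an array index. All values are invented.
import numpy as np

start_pos = 1000
positions_to_sample = np.array([1000, 1010, 1030])  # hypothetical sampled positions
read_starts = [1000, 1000, 1030]                    # hypothetical 5' ends of forward reads
counts = np.bincount([r - start_pos for r in read_starts],
                     minlength=int(positions_to_sample.max()) - start_pos + 2)
read_counts = counts[positions_to_sample - positions_to_sample.min()]
print(read_counts)  # -> [2 0 1], reads starting at each sampled position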
@@ -331,16 +357,18 @@ def tabulateGCcontent_worker(chromNameBam, start, end, stepSize, if verbose: print("[{:.3f}] caching reads".format(time.time() - startTime)) - counts = np.bincount([r.pos - start_pos - for r in bam.fetch(chromNameBam, start_pos, - end_pos + 1) - if not r.is_reverse and not r.is_unmapped and r.pos >= start_pos], - minlength=end_pos - start_pos + 2) + counts = np.bincount( + [ + r.pos - start_pos + for r in bam.fetch(chromNameBam, start_pos, end_pos + 1) + if not r.is_reverse and not r.is_unmapped and r.pos >= start_pos + ], + minlength=end_pos - start_pos + 2, + ) read_counts = counts[positions_to_sample - min(positions_to_sample)] if verbose: - print("[{:.3f}] finish caching reads.".format( - time.time() - startTime)) + print("[{:.3f}] finish caching reads.".format(time.time() - startTime)) countTime = time.time() @@ -348,11 +376,17 @@ def tabulateGCcontent_worker(chromNameBam, start, end, stepSize, for index in range(len(positions_to_sample)): i = positions_to_sample[index] # stop if the end of the chromosome is reached - if i + fragmentLength['median'] > tbit.chroms(chromNameBit): + if i + fragmentLength["median"] > tbit.chroms(chromNameBit): break try: - gc = getGC_content(tbit, chromNameBit, int(i), int(i + fragmentLength['median']), fraction=False) + gc = getGC_content( + tbit, + chromNameBit, + int(i), + int(i + fragmentLength["median"]), + fraction=False, + ) except Exception as detail: if verbose: print(detail) @@ -362,12 +396,17 @@ def tabulateGCcontent_worker(chromNameBam, start, end, stepSize, # count all reads at position 'i' if len(read_counts) == 0: # case when no cache was done - num_reads = len([x.pos for x in bam.fetch(chromNameBam, i, i + 1) - if x.is_reverse is False and x.pos == i]) + num_reads = len( + [ + x.pos + for x in bam.fetch(chromNameBam, i, i + 1) + if x.is_reverse is False and x.pos == i + ] + ) else: num_reads = read_counts[index] - if num_reads >= global_vars['max_reads']: + if num_reads >= global_vars["max_reads"]: peak += 1 continue @@ -375,27 +414,58 @@ def tabulateGCcontent_worker(chromNameBam, start, end, stepSize, if verbose: if index % 50000 == 0: endTime = time.time() - print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" % - (multiprocessing.current_process().name, - index, index / (endTime - countTime), - chromNameBit, start, end, stepSize)) + print( + "%s processing %d (%.1f per sec) @ %s:%s-%s %s" + % ( + multiprocessing.current_process().name, + index, + index / (endTime - countTime), + chromNameBit, + start, + end, + stepSize, + ) + ) c += 1 if verbose: endTime = time.time() - print("%s processing %d (%.1f per sec) @ %s:%s-%s %s" % - (multiprocessing.current_process().name, - index, index / (endTime - countTime), - chromNameBit, start, end, stepSize)) - print("%s total time %.1f @ %s:%s-%s %s" % (multiprocessing.current_process().name, - (endTime - startTime), chromNameBit, start, end, stepSize)) - - return(subN_gc, subF_gc) - - -def tabulateGCcontent(fragmentLength, chrNameBitToBam, stepSize, - chromSizes, numberOfProcessors=None, verbose=False, - region=None): + print( + "%s processing %d (%.1f per sec) @ %s:%s-%s %s" + % ( + multiprocessing.current_process().name, + index, + index / (endTime - countTime), + chromNameBit, + start, + end, + stepSize, + ) + ) + print( + "%s total time %.1f @ %s:%s-%s %s" + % ( + multiprocessing.current_process().name, + (endTime - startTime), + chromNameBit, + start, + end, + stepSize, + ) + ) + + return (subN_gc, subF_gc) + + +def tabulateGCcontent( + fragmentLength, + chrNameBitToBam, 
+ stepSize, + chromSizes, + numberOfProcessors=None, + verbose=False, + region=None, +): r""" Subdivides the genome or the reads into chunks to be analyzed in parallel using several processors. This codes handles the creation of @@ -420,17 +490,17 @@ def tabulateGCcontent(fragmentLength, chrNameBitToBam, stepSize, global global_vars chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()]) - chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp'])) + chunkSize = int(min(2e6, 4e5 / global_vars["reads_per_bp"])) chromSizes = [(k, v) for k, v in chromSizes if k in list(chrNameBamToBit.keys())] - imap_res = mapReduce.mapReduce((stepSize, - fragmentLength, chrNameBamToBit, - verbose), - tabulateGCcontent_wrapper, - chromSizes, - genomeChunkLength=chunkSize, - numberOfProcessors=numberOfProcessors, - region=region) + imap_res = mapReduce.mapReduce( + (stepSize, fragmentLength, chrNameBamToBit, verbose), + tabulateGCcontent_wrapper, + chromSizes, + genomeChunkLength=chunkSize, + numberOfProcessors=numberOfProcessors, + region=region, + ) for subN_gc, subF_gc in imap_res: try: @@ -441,20 +511,31 @@ def tabulateGCcontent(fragmentLength, chrNameBitToBam, stepSize, N_gc = subN_gc if sum(F_gc) == 0: - sys.exit("No fragments included in the sampling! Consider decreasing (or maybe increasing) the --sampleSize parameter") + sys.exit( + "No fragments included in the sampling! Consider decreasing (or maybe increasing) the --sampleSize parameter" + ) scaling = float(sum(N_gc)) / float(sum(F_gc)) - R_gc = np.array([float(F_gc[x]) / N_gc[x] * scaling - if N_gc[x] and F_gc[x] > 0 else 1 - for x in range(len(F_gc))]) + R_gc = np.array( + [ + float(F_gc[x]) / N_gc[x] * scaling if N_gc[x] and F_gc[x] > 0 else 1 + for x in range(len(F_gc)) + ] + ) data = np.transpose(np.vstack((F_gc, N_gc, R_gc))) return data -def countReadsPerGC(regionSize, chrNameBitToBam, stepSize, - chromSizes, numberOfProcessors=None, verbose=False, - region=None): +def countReadsPerGC( + regionSize, + chrNameBitToBam, + stepSize, + chromSizes, + numberOfProcessors=None, + verbose=False, + region=None, +): r""" Computes for a region of size regionSize, the GC of the region and the number of reads that overlap it. 
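# --- Illustrative aside, not part of this patch: a toy version of the bias
# --- ratio assembled in tabulateGCcontent() above. F_gc counts reads per GC
# --- bin, N_gc counts sampled genome windows per GC bin, and R_gc is their
# --- scaled ratio, exactly as in the hunk above. The numbers are invented.
import numpy as np

N_gc = np.array([10.0, 40.0, 80.0, 40.0, 10.0])  # genome windows per GC bin (assumed)
F_gc = np.array([2.0, 30.0, 90.0, 60.0, 5.0])    # read counts per GC bin (assumed)
scaling = N_gc.sum() / F_gc.sum()
R_gc = np.where((N_gc > 0) & (F_gc > 0), F_gc / N_gc * scaling, 1.0)
print(np.round(R_gc, 2))  # bins with R_gc > 1 attract more reads than expected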
@@ -471,16 +552,16 @@ def countReadsPerGC(regionSize, chrNameBitToBam, stepSize, global global_vars chrNameBamToBit = dict([(v, k) for k, v in chrNameBitToBam.items()]) - chunkSize = int(min(2e6, 4e5 / global_vars['reads_per_bp'])) + chunkSize = int(min(2e6, 4e5 / global_vars["reads_per_bp"])) - imap_res = mapReduce.mapReduce((stepSize, - regionSize, chrNameBamToBit, - verbose), - countReadsPerGC_wrapper, - chromSizes, - genomeChunkLength=chunkSize, - numberOfProcessors=numberOfProcessors, - region=region) + imap_res = mapReduce.mapReduce( + (stepSize, regionSize, chrNameBamToBit, verbose), + countReadsPerGC_wrapper, + chromSizes, + genomeChunkLength=chunkSize, + numberOfProcessors=numberOfProcessors, + region=region, + ) reads_per_gc = [] for sub_reads_per_gc in imap_res: @@ -506,7 +587,7 @@ def smooth(x, window_len=3): if i < half_width or i + half_width + 1 > len(x): continue else: - y[i] = np.mean(x[i - half_width:i + half_width + 1]) + y[i] = np.mean(x[i - half_width : i + half_width + 1]) # clip low values, this avoid problems with zeros return y @@ -538,14 +619,44 @@ def plotlyGCbias(file_name, frequencies, reads_per_gc, region_size): import matplotlib.cbook as cbook fig = go.Figure() - fig['layout']['xaxis1'] = dict(domain=[0.0, 1.0], anchor="y1", title="GC fraction") - fig['layout']['yaxis1'] = dict(domain=[0.55, 1.0], anchor="x1", title="Number of reads") - fig['layout']['xaxis2'] = dict(domain=[0.0, 1.0], anchor="y2", title="GC fraction", range=[0.2, 0.7]) - fig['layout']['yaxis2'] = dict(domain=[0.0, 0.45], anchor="x2", title="log2(observed/expected)") + fig["layout"]["xaxis1"] = dict(domain=[0.0, 1.0], anchor="y1", title="GC fraction") + fig["layout"]["yaxis1"] = dict( + domain=[0.55, 1.0], anchor="x1", title="Number of reads" + ) + fig["layout"]["xaxis2"] = dict( + domain=[0.0, 1.0], anchor="y2", title="GC fraction", range=[0.2, 0.7] + ) + fig["layout"]["yaxis2"] = dict( + domain=[0.0, 0.45], anchor="x2", title="log2(observed/expected)" + ) text = "reads per {} base region".format(region_size) - annos = [{'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': text, 'y': 1.0, 'x': 0.5, 'font': {'size': 16}, 'showarrow': False}] + annos = [ + { + "yanchor": "bottom", + "xref": "paper", + "xanchor": "center", + "yref": "paper", + "text": text, + "y": 1.0, + "x": 0.5, + "font": {"size": 16}, + "showarrow": False, + } + ] text = "normalized observed/expected read counts" - annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': text, 'y': 0.5, 'x': 0.5, 'font': {'size': 16}, 'showarrow': False}) + annos.append( + { + "yanchor": "bottom", + "xref": "paper", + "xanchor": "center", + "yref": "paper", + "text": text, + "y": 0.5, + "x": 0.5, + "font": {"size": 16}, + "showarrow": False, + } + ) # prepare data for boxplot reads, GC = reads_per_gc.T @@ -558,29 +669,58 @@ def plotlyGCbias(file_name, frequencies, reads_per_gc, region_size): bins = [] for b in reads_per_gc: s = cbook.boxplot_stats(b)[0] - bins.append([s['whislo'], s['q1'], s['q1'], s['med'], s['med'], s['med'], s['q3'], s['q3'], s['whishi']]) + bins.append( + [ + s["whislo"], + s["q1"], + s["q1"], + s["med"], + s["med"], + s["med"], + s["q3"], + s["q3"], + s["whishi"], + ] + ) data = [] # top plot for x, y in zip(bin_labels, bins): - trace = go.Box(x=x, y=y, xaxis='x1', yaxis='y1', boxpoints='outliers', showlegend=False, name="{}".format(x), line=dict(color='rgb(107,174,214)')) + trace = go.Box( + x=x, + y=y, + xaxis="x1", + yaxis="y1", + 
boxpoints="outliers", + showlegend=False, + name="{}".format(x), + line=dict(color="rgb(107,174,214)"), + ) data.append(trace) # bottom plot x = np.linspace(0, 1, frequencies.shape[0]) - trace = go.Scatter(x=x, y=np.log2(frequencies[:, 2]), xaxis='x2', yaxis='y2', showlegend=False, line=dict(color='rgb(107,174,214)')) + trace = go.Scatter( + x=x, + y=np.log2(frequencies[:, 2]), + xaxis="x2", + yaxis="y2", + showlegend=False, + line=dict(color="rgb(107,174,214)"), + ) data.append(trace) fig.add_traces(data) - fig['layout']['annotations'] = annos + fig["layout"]["annotations"] = annos py.plot(fig, filename=file_name, auto_open=False) def plotGCbias(file_name, frequencies, reads_per_gc, region_size, image_format=None): import matplotlib - matplotlib.use('Agg') - matplotlib.rcParams['pdf.fonttype'] = 42 - matplotlib.rcParams['svg.fonttype'] = 'none' + + matplotlib.use("Agg") + matplotlib.rcParams["pdf.fonttype"] = 42 + matplotlib.rcParams["svg.fonttype"] = "none" import matplotlib.pyplot as plt # prepare data for boxplot @@ -593,21 +733,20 @@ def plotGCbias(file_name, frequencies, reads_per_gc, region_size, image_format=N title = "reads per regions of {} bp".format(region_size) fig = plt.figure(figsize=(6, 8)) ax1 = fig.add_subplot(211, title=title) - ax2 = fig.add_subplot(212, - title='normalized observed/expected read counts') + ax2 = fig.add_subplot(212, title="normalized observed/expected read counts") # make boxplot bp = ax1.boxplot(reads_per_gc, notch=0, patch_artist=True) - plt.setp(bp['boxes'], color='black', facecolor='LightGreen') - plt.setp(bp['medians'], color='black') - plt.setp(bp['whiskers'], color='black', linestyle='dashed') - plt.setp(bp['fliers'], marker='None') + plt.setp(bp["boxes"], color="black", facecolor="LightGreen") + plt.setp(bp["medians"], color="black") + plt.setp(bp["whiskers"], color="black", linestyle="dashed") + plt.setp(bp["fliers"], marker="None") # get the whisker that spands the most - y_max = np.nanmax([x.get_data()[1][1] for x in bp['whiskers']]) + y_max = np.nanmax([x.get_data()[1][1] for x in bp["whiskers"]]) ax1.set_ylim(0 - (y_max * 0.05), y_max * 1.05) - ax1.set_ylabel('Number of reads') - ax1.set_xlabel('GC fraction') + ax1.set_ylabel("Number of reads") + ax1.set_xlabel("GC fraction") xticks = [idx for idx, x in enumerate(bin_labels) if int(x * 100) % 10 == 0] @@ -616,12 +755,12 @@ def plotGCbias(file_name, frequencies, reads_per_gc, region_size, image_format=N x = np.linspace(0, 1, frequencies.shape[0]) y = np.log2(frequencies[:, 2]) - ax2.plot(x, y, color='#8c96f0') - ax2.set_xlabel('GC fraction') - ax2.set_ylabel('log2ratio observed/expected') + ax2.plot(x, y, color="#8c96f0") + ax2.set_xlabel("GC fraction") + ax2.set_ylabel("log2ratio observed/expected") ax2.set_xlim(0.2, 0.7) - y_max = max(y[np.where(x >= 0.2)[0][0]:np.where(x <= 0.7)[0][-1] + 1]) - y_min = min(y[np.where(x >= 0.2)[0][0]:np.where(x <= 0.7)[0][-1] + 1]) + y_max = max(y[np.where(x >= 0.2)[0][0] : np.where(x <= 0.7)[0][-1] + 1]) + y_min = min(y[np.where(x >= 0.2)[0][0] : np.where(x <= 0.7)[0][-1] + 1]) if y_max > 0: y_max *= 1.1 else: @@ -632,7 +771,7 @@ def plotGCbias(file_name, frequencies, reads_per_gc, region_size, image_format=N y_min *= 0.9 ax2.set_ylim(y_min, y_max) plt.tight_layout() - plt.savefig(file_name, bbox_inches='tight', dpi=100, format=image_format) + plt.savefig(file_name, bbox_inches="tight", dpi=100, format=image_format) plt.close() @@ -647,42 +786,49 @@ def main(args=None): global global_vars global_vars = {} - global_vars['2bit'] = args.genome - 
global_vars['bam'] = args.bamfile - global_vars['filter_out'] = args.blackListFileName - global_vars['extra_sampling_file'] = extra_sampling_file + global_vars["2bit"] = args.genome + global_vars["bam"] = args.bamfile + global_vars["filter_out"] = args.blackListFileName + global_vars["extra_sampling_file"] = extra_sampling_file - tbit = py2bit.open(global_vars['2bit']) - bam, mapped, unmapped, stats = bamHandler.openBam(global_vars['bam'], returnStats=True, nThreads=args.numberOfProcessors) + tbit = py2bit.open(global_vars["2bit"]) + bam, mapped, unmapped, stats = bamHandler.openBam( + global_vars["bam"], returnStats=True, nThreads=args.numberOfProcessors + ) if args.fragmentLength: - fragment_len_dict = \ - {'median': args.fragmentLength} + fragment_len_dict = {"median": args.fragmentLength} else: - fragment_len_dict, __ = \ - get_read_and_fragment_length(args.bamfile, None, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose) + fragment_len_dict, __ = get_read_and_fragment_length( + args.bamfile, + None, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + ) if not fragment_len_dict: - print("\nPlease provide the fragment length used for the " - "sample preparation.\n") + print( + "\nPlease provide the fragment length used for the " + "sample preparation.\n" + ) exit(1) - fragment_len_dict = {'median': int(fragment_len_dict['median'])} + fragment_len_dict = {"median": int(fragment_len_dict["median"])} chrNameBitToBam = tbitToBamChrName(list(tbit.chroms().keys()), bam.references) - global_vars['genome_size'] = sum(tbit.chroms().values()) - global_vars['total_reads'] = mapped - global_vars['reads_per_bp'] = \ - float(global_vars['total_reads']) / args.effectiveGenomeSize + global_vars["genome_size"] = sum(tbit.chroms().values()) + global_vars["total_reads"] = mapped + global_vars["reads_per_bp"] = ( + float(global_vars["total_reads"]) / args.effectiveGenomeSize + ) confidence_p_value = float(1) / args.sampleSize # chromSizes: list of tuples - chromSizes = [(bam.references[i], bam.lengths[i]) - for i in range(len(bam.references))] + chromSizes = [ + (bam.references[i], bam.lengths[i]) for i in range(len(bam.references)) + ] chromSizes = [x for x in chromSizes if x[0] in tbit.chroms()] # use poisson distribution to identify peaks that should be discarted. @@ -692,107 +838,126 @@ def main(args=None): # empirically, a value of at least 4 times as big as the # reads_per_bp was found. # Similarly for the min value, I divide by 4. - global_vars['max_reads'] = poisson(4 * global_vars['reads_per_bp'] * fragment_len_dict['median']).isf(confidence_p_value) + global_vars["max_reads"] = poisson( + 4 * global_vars["reads_per_bp"] * fragment_len_dict["median"] + ).isf(confidence_p_value) # this may be of not use, unless the depth of sequencing is really high # as this value is close to 0 - global_vars['min_reads'] = poisson(0.25 * global_vars['reads_per_bp'] * fragment_len_dict['median']).ppf(confidence_p_value) + global_vars["min_reads"] = poisson( + 0.25 * global_vars["reads_per_bp"] * fragment_len_dict["median"] + ).ppf(confidence_p_value) for key in global_vars: print("{}: {}".format(key, global_vars[key])) print("computing frequencies") # the GC of the genome is sampled each stepSize bp. 
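# --- Illustrative aside, not part of this patch: the max_reads/min_reads
# --- thresholds computed above model read starts per position as Poisson;
# --- isf(1/sampleSize) gives the count a position would exceed only by
# --- chance, so higher counts are treated as peaks and skipped. The
# --- coverage and fragment-length values below are assumed, not measured.
from scipy.stats import poisson

reads_per_bp, median_fragment_len = 0.02, 200  # assumed coverage and fragment length
confidence_p_value = 1.0 / 5e7                 # 1 / --sampleSize
max_reads = poisson(4 * reads_per_bp * median_fragment_len).isf(confidence_p_value)
min_reads = poisson(0.25 * reads_per_bp * median_fragment_len).ppf(confidence_p_value)
print(max_reads, min_reads)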
- stepSize = max(int(global_vars['genome_size'] / args.sampleSize), 1) + stepSize = max(int(global_vars["genome_size"] / args.sampleSize), 1) print("stepSize: {}".format(stepSize)) - data = tabulateGCcontent(fragment_len_dict, - chrNameBitToBam, stepSize, - chromSizes, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose, - region=args.region) + data = tabulateGCcontent( + fragment_len_dict, + chrNameBitToBam, + stepSize, + chromSizes, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + region=args.region, + ) np.savetxt(args.GCbiasFrequenciesFile.name, data) if args.biasPlot: - reads_per_gc = countReadsPerGC(args.regionSize, - chrNameBitToBam, stepSize * 10, - chromSizes, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose, - region=args.region) + reads_per_gc = countReadsPerGC( + args.regionSize, + chrNameBitToBam, + stepSize * 10, + chromSizes, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + region=args.region, + ) if args.plotFileFormat == "plotly": plotlyGCbias(args.biasPlot, data, reads_per_gc, args.regionSize) else: - plotGCbias(args.biasPlot, data, reads_per_gc, args.regionSize, image_format=args.plotFileFormat) + plotGCbias( + args.biasPlot, + data, + reads_per_gc, + args.regionSize, + image_format=args.plotFileFormat, + ) -class Tester(): +class Tester: def __init__(self): import os + self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_corrGC/" self.tbitFile = self.root + "sequence.2bit" self.bamFile = self.root + "test.bam" self.mappability = self.root + "mappability.bw" - self.chrNameBam = '2L' - self.chrNameBit = 'chr2L' - bam, mapped, unmapped, stats = bamHandler.openBam(self.bamFile, returnStats=True) + self.chrNameBam = "2L" + self.chrNameBit = "chr2L" + bam, mapped, unmapped, stats = bamHandler.openBam( + self.bamFile, returnStats=True + ) tbit = py2bit.open(self.tbitFile) global debug debug = 0 global global_vars - global_vars = {'2bit': self.tbitFile, - 'bam': self.bamFile, - 'filter_out': None, - 'mappability': self.mappability, - 'extra_sampling_file': None, - 'max_reads': 5, - 'min_reads': 0, - 'min_reads': 0, - 'reads_per_bp': 0.3, - 'total_reads': mapped, - 'genome_size': sum(tbit.chroms().values()) - } + global_vars = { + "2bit": self.tbitFile, + "bam": self.bamFile, + "filter_out": None, + "mappability": self.mappability, + "extra_sampling_file": None, + "max_reads": 5, + "min_reads": 0, + "reads_per_bp": 0.3, + "total_reads": mapped, + "genome_size": sum(tbit.chroms().values()), + } def testTabulateGCcontentWorker(self): stepSize = 2 - fragmentLength = {'min': 1, 'median': 3, 'max': 5} + fragmentLength = {"min": 1, "median": 3, "max": 5} start = 0 end = 20 - chrNameBam2bit = {'2L': 'chr2L'} - return (self.chrNameBam, - start, end, stepSize, fragmentLength, chrNameBam2bit) + chrNameBam2bit = {"2L": "chr2L"} + return (self.chrNameBam, start, end, stepSize, fragmentLength, chrNameBam2bit) def set_filter_out_file(self): global global_vars - global_vars['filter_out'] = self.root + "filter_out.bed" + global_vars["filter_out"] = self.root + "filter_out.bed" def unset_filter_out_file(self): global global_vars - global_vars['filter_out'] = None + global_vars["filter_out"] = None def set_extra_sampling_file(self): global global_vars - global_vars['extra_sampling_file'] = self.root + "extra_sampling.bed" + global_vars["extra_sampling_file"] = self.root + "extra_sampling.bed" def testTabulateGCcontent(self): - fragmentLength = {'median': 10} - chrNameBitToBam = 
{'chr2L': '2L'} + fragmentLength = {"median": 10} + chrNameBitToBam = {"chr2L": "2L"} stepSize = 1 - bam = bamHandler.openBam(global_vars['bam']) - chromSizes = [(bam.references[i], bam.lengths[i]) - for i in range(len(bam.references))] - return (fragmentLength, - chrNameBitToBam, stepSize, chromSizes, 1) + bam = bamHandler.openBam(global_vars["bam"]) + chromSizes = [ + (bam.references[i], bam.lengths[i]) for i in range(len(bam.references)) + ] + return (fragmentLength, chrNameBitToBam, stepSize, chromSizes, 1) def testCountReadsPerGC(self): regionSize = 300 - chrNameBitToBam = {'chr2L': '2L'} + chrNameBitToBam = {"chr2L": "2L"} stepSize = 1 - bam = bamHandler.openBam(global_vars['bam']) - chromSizes = [(bam.references[i], bam.lengths[i]) - for i in range(len(bam.references))] - return (regionSize, - chrNameBitToBam, stepSize, chromSizes, 1) + bam = bamHandler.openBam(global_vars["bam"]) + chromSizes = [ + (bam.references[i], bam.lengths[i]) for i in range(len(bam.references)) + ] + return (regionSize, chrNameBitToBam, stepSize, chromSizes, 1) if __name__ == "__main__": diff --git a/deeptools/computeMatrix.py b/deeptools/computeMatrix.py index 345bbc79a..e52abc460 100644 --- a/deeptools/computeMatrix.py +++ b/deeptools/computeMatrix.py @@ -15,10 +15,9 @@ def parse_arguments(args=None): - parser = \ - argparse.ArgumentParser( - formatter_class=argparse.RawDescriptionHelpFormatter, - description=""" + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=""" This tool calculates scores per genome regions and prepares an intermediate file that can be used with ``plotHeatmap`` and ``plotProfiles``. Typically, the genome regions are genes, but any other regions defined in a BED file can be used. @@ -33,320 +32,409 @@ def parse_arguments(args=None): $ computeMatrix scale-regions --help """, - epilog='An example usage is:\n computeMatrix reference-point -S ' - ' -R -b 1000\n \n') + epilog="An example usage is:\n computeMatrix reference-point -S " + " -R -b 1000\n \n", + ) - parser.add_argument('--version', action='version', - version='%(prog)s {}'.format(__version__)) + parser.add_argument( + "--version", action="version", version="%(prog)s {}".format(__version__) + ) - subparsers = parser.add_subparsers( - title='Commands', - dest='command', - metavar='') + subparsers = parser.add_subparsers(title="Commands", dest="command", metavar="") dbParser = parserCommon.deepBlueOptionalArgs() # scale-regions mode options subparsers.add_parser( - 'scale-regions', + "scale-regions", formatter_class=argparse.ArgumentDefaultsHelpFormatter, - parents=[computeMatrixRequiredArgs(), - computeMatrixOutputArgs(), - computeMatrixOptArgs(case='scale-regions'), - parserCommon.gtf_options(), - dbParser], + parents=[ + computeMatrixRequiredArgs(), + computeMatrixOutputArgs(), + computeMatrixOptArgs(case="scale-regions"), + parserCommon.gtf_options(), + dbParser, + ], help="In the scale-regions mode, all regions in the BED file are " "stretched or shrunken to the length (in bases) indicated by the user.", - usage='An example usage is:\n computeMatrix scale-regions -S ' - ' -R -b 1000\n\n') + usage="An example usage is:\n computeMatrix scale-regions -S " + " -R -b 1000\n\n", + ) # reference point arguments subparsers.add_parser( - 'reference-point', + "reference-point", formatter_class=argparse.ArgumentDefaultsHelpFormatter, - parents=[computeMatrixRequiredArgs(), - computeMatrixOutputArgs(), - computeMatrixOptArgs(case='reference-point'), - parserCommon.gtf_options(), - 
dbParser], + parents=[ + computeMatrixRequiredArgs(), + computeMatrixOutputArgs(), + computeMatrixOptArgs(case="reference-point"), + parserCommon.gtf_options(), + dbParser, + ], help="Reference-point refers to a position within a BED region " "(e.g., the starting point). In this mode, only those genomic" "positions before (upstream) and/or after (downstream) of the " "reference point will be plotted.", - usage='An example usage is:\n computeMatrix reference-point -S ' - ' -R -a 3000 -b 3000\n\n') + usage="An example usage is:\n computeMatrix reference-point -S " + " -R -a 3000 -b 3000\n\n", + ) return parser def computeMatrixRequiredArgs(args=None): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') - required.add_argument('--regionsFileName', '-R', - metavar='File', - help='File name or names, in BED or GTF format, containing ' - 'the regions to plot. If multiple bed files are given, each one is considered a ' - 'group that can be plotted separately. Also, adding a "#" symbol in the bed file ' - 'causes all the regions until the previous "#" to be considered one group.', - nargs='+', - required=True) - required.add_argument('--scoreFileName', '-S', - help='bigWig file(s) containing ' - 'the scores to be plotted. Multiple files should be separated by spaced. BigWig ' - 'files can be obtained by using the bamCoverage ' - 'or bamCompare tools. More information about ' - 'the bigWig file format can be found at ' - 'http://genome.ucsc.edu/goldenPath/help/bigWig.html ', - metavar='File', - nargs='+', - required=True) + required = parser.add_argument_group("Required arguments") + required.add_argument( + "--regionsFileName", + "-R", + metavar="File", + help="File name or names, in BED or GTF format, containing " + "the regions to plot. If multiple bed files are given, each one is considered a " + 'group that can be plotted separately. Also, adding a "#" symbol in the bed file ' + 'causes all the regions until the previous "#" to be considered one group.', + nargs="+", + required=True, + ) + required.add_argument( + "--scoreFileName", + "-S", + help="bigWig file(s) containing " + "the scores to be plotted. Multiple files should be separated by spaces. BigWig " + "files can be obtained by using the bamCoverage " + "or bamCompare tools. More information about " + "the bigWig file format can be found at " + "http://genome.ucsc.edu/goldenPath/help/bigWig.html ", + metavar="File", + nargs="+", + required=True, + ) return parser def computeMatrixOutputArgs(args=None): parser = argparse.ArgumentParser(add_help=False) - output = parser.add_argument_group('Output options') - output.add_argument('--outFileName', '-out', '-o', - help='File name to save the gzipped matrix file ' - 'needed by the "plotHeatmap" and "plotProfile" tools.', - type=writableFile, - required=True) - - output.add_argument('--outFileNameMatrix', - help='If this option is given, then the matrix ' - 'of values underlying the heatmap will be saved ' - 'using the indicated name, e.g. IndividualValues.tab.' - 'This matrix can easily be loaded into R or ' - 'other programs.', - metavar='FILE', - type=writableFile) - output.add_argument('--outFileSortedRegions', - help='File name in which the regions are saved ' - 'after skiping zeros or min/max threshold values. The ' - 'order of the regions in the file follows the sorting ' - 'order selected. This is useful, for example, to ' - 'generate other heatmaps keeping the sorting of the ' - 'first heatmap. 
Example: Heatmap1sortedRegions.bed', - metavar='BED file', - type=argparse.FileType('w')) + output = parser.add_argument_group("Output options") + output.add_argument( + "--outFileName", + "-out", + "-o", + help="File name to save the gzipped matrix file " + 'needed by the "plotHeatmap" and "plotProfile" tools.', + type=writableFile, + required=True, + ) + + output.add_argument( + "--outFileNameMatrix", + help="If this option is given, then the matrix " + "of values underlying the heatmap will be saved " + "using the indicated name, e.g. IndividualValues.tab. " + "This matrix can easily be loaded into R or " + "other programs.", + metavar="FILE", + type=writableFile, + ) + output.add_argument( + "--outFileSortedRegions", + help="File name in which the regions are saved " + "after skipping zeros or min/max threshold values. The " + "order of the regions in the file follows the sorting " + "order selected. This is useful, for example, to " + "generate other heatmaps keeping the sorting of the " + "first heatmap. Example: Heatmap1sortedRegions.bed", + metavar="BED file", + type=argparse.FileType("w"), + ) return parser -def computeMatrixOptArgs(case=['scale-regions', 'reference-point'][0]): - +def computeMatrixOptArgs(case=["scale-regions", "reference-point"][0]): parser = argparse.ArgumentParser(add_help=False) - optional = parser.add_argument_group('Optional arguments') - optional.add_argument('--version', action='version', - version='%(prog)s {}'.format(__version__)) - - if case == 'scale-regions': - optional.add_argument('--regionBodyLength', '-m', - default=1000, - type=int, - help='Distance in bases to which all regions will ' - 'be fit. (Default: %(default)s)') - optional.add_argument('--startLabel', - default='TSS', - help='Label shown in the plot for the start of ' - 'the region. Default is TSS (transcription ' - 'start site), but could be changed to anything, ' - 'e.g. "peak start". Note that this is only ' - 'useful if you plan to plot the results yourself ' - 'and not, for example, with plotHeatmap, which ' - 'will override this. (Default: %(default)s)') - optional.add_argument('--endLabel', - default='TES', - help='Label shown in the plot for the region ' - 'end. Default is TES (transcription end site). ' - 'See the --startLabel option for more ' - 'information. (Default: %(default)s) ') - optional.add_argument('--beforeRegionStartLength', '-b', '--upstream', - default=0, - type=int, - help='Distance upstream of the start site of ' - 'the regions defined in the region file. If the ' - 'regions are genes, this would be the distance ' - 'upstream of the transcription start site. (Default: %(default)s)') - optional.add_argument('--afterRegionStartLength', '-a', '--downstream', - default=0, - type=int, - help='Distance downstream of the end site ' - 'of the given regions. If the ' - 'regions are genes, this would be the distance ' - 'downstream of the transcription end site. (Default: %(default)s)') - optional.add_argument("--unscaled5prime", - default=0, - type=int, - help='Number of bases at the 5-prime end of the ' - 'region to exclude from scaling. By default, ' - 'each region is scaled to a given length (see the --regionBodyLength option). In some cases it is useful to look at unscaled signals around region boundaries, so this setting specifies the number of unscaled bases on the 5-prime end of each boundary. (Default: %(default)s)') - optional.add_argument("--unscaled3prime", - default=0, - type=int, - help='Like --unscaled5prime, but for the 3-prime ' - 'end. 
(Default: %(default)s)') - - elif case == 'reference-point': - optional.add_argument('--referencePoint', - default='TSS', - choices=['TSS', 'TES', 'center'], - help='The reference point for the plotting ' - 'could be either the region start (TSS), the ' - 'region end (TES) or the center of the region. ' - 'Note that regardless of what you specify, ' - 'plotHeatmap/plotProfile will default to using "TSS" as the ' - 'label. (Default: %(default)s)') + optional = parser.add_argument_group("Optional arguments") + optional.add_argument( + "--version", action="version", version="%(prog)s {}".format(__version__) + ) + + if case == "scale-regions": + optional.add_argument( + "--regionBodyLength", + "-m", + default=1000, + type=int, + help="Distance in bases to which all regions will " + "be fit. (Default: %(default)s)", + ) + optional.add_argument( + "--startLabel", + default="TSS", + help="Label shown in the plot for the start of " + "the region. Default is TSS (transcription " + "start site), but could be changed to anything, " + 'e.g. "peak start". Note that this is only ' + "useful if you plan to plot the results yourself " + "and not, for example, with plotHeatmap, which " + "will override this. (Default: %(default)s)", + ) + optional.add_argument( + "--endLabel", + default="TES", + help="Label shown in the plot for the region " + "end. Default is TES (transcription end site). " + "See the --startLabel option for more " + "information. (Default: %(default)s) ", + ) + optional.add_argument( + "--beforeRegionStartLength", + "-b", + "--upstream", + default=0, + type=int, + help="Distance upstream of the start site of " + "the regions defined in the region file. If the " + "regions are genes, this would be the distance " + "upstream of the transcription start site. (Default: %(default)s)", + ) + optional.add_argument( + "--afterRegionStartLength", + "-a", + "--downstream", + default=0, + type=int, + help="Distance downstream of the end site " + "of the given regions. If the " + "regions are genes, this would be the distance " + "downstream of the transcription end site. (Default: %(default)s)", + ) + optional.add_argument( + "--unscaled5prime", + default=0, + type=int, + help="Number of bases at the 5-prime end of the " + "region to exclude from scaling. By default, " + "each region is scaled to a given length (see the --regionBodyLength option). In some cases it is useful to look at unscaled signals around region boundaries, so this setting specifies the number of unscaled bases on the 5-prime end of each boundary. (Default: %(default)s)", + ) + optional.add_argument( + "--unscaled3prime", + default=0, + type=int, + help="Like --unscaled5prime, but for the 3-prime " + "end. (Default: %(default)s)", + ) + + elif case == "reference-point": + optional.add_argument( + "--referencePoint", + default="TSS", + choices=["TSS", "TES", "center"], + help="The reference point for the plotting " + "could be either the region start (TSS), the " + "region end (TES) or the center of the region. " + "Note that regardless of what you specify, " + 'plotHeatmap/plotProfile will default to using "TSS" as the ' + "label. 
(Default: %(default)s)", + ) # set region body length to zero for reference point mode - optional.add_argument('--regionBodyLength', help=argparse.SUPPRESS, - default=0, type=int) - optional.add_argument('--unscaled5prime', default=0, type=int, help=argparse.SUPPRESS) - optional.add_argument('--unscaled3prime', default=0, type=int, help=argparse.SUPPRESS) - optional.add_argument('--beforeRegionStartLength', '-b', '--upstream', - default=500, - type=int, - metavar='INT bp', - help='Distance upstream of the reference-point ' - 'selected. (Default: %(default)s)') - optional.add_argument('--afterRegionStartLength', '-a', '--downstream', - default=1500, - metavar='INT bp', - type=int, - help='Distance downstream of the ' - 'reference-point selected. (Default: %(default)s)') - optional.add_argument('--nanAfterEnd', - action='store_true', - help='If set, any values after the region end ' - 'are discarded. This is useful to visualize ' - 'the region end when not using the ' - 'scale-regions mode and when the reference-' - 'point is set to the TSS.') - - optional.add_argument('--binSize', '-bs', - help='Length, in bases, of the non-overlapping ' - 'bins for averaging the score over the ' - 'regions length. (Default: %(default)s)', - type=int, - default=10) - - optional.add_argument('--sortRegions', - help='Whether the output file should present the ' - 'regions sorted. The default is to not sort the regions. ' - 'Note that this is only useful if you plan to plot ' - 'the results yourself and not, for example, with ' - 'plotHeatmap, which will override this. Note also that ' - 'unsorted output will be in whatever order the regions ' - 'happen to be processed in and not match the order in ' - 'the input files. If you require the output order to ' - 'match that of the input regions, then either specify ' - '"keep" or use computeMatrixOperations to resort the ' - 'results file. (Default: %(default)s)', - choices=["descend", "ascend", "no", "keep"], - default='keep') - - optional.add_argument('--sortUsing', - help='Indicate which method should be used for ' - 'sorting. The value is computed for each row.' - 'Note that the region_length option will lead ' - 'to a dotted line within the heatmap that indicates ' - 'the end of the regions. (Default: %(default)s)', - choices=["mean", "median", "max", "min", "sum", - "region_length"], - default='mean') - - optional.add_argument('--sortUsingSamples', - help='List of sample numbers (order as in matrix), ' - 'that are used for sorting by --sortUsing, ' - 'no value uses all samples, ' - 'example: --sortUsingSamples 1 3', - type=int, nargs='+') - - optional.add_argument('--averageTypeBins', - default='mean', - choices=["mean", "median", "min", - "max", "std", "sum"], - help='Define the type of statistic that should be ' - 'used over the bin size range. The ' - 'options are: "mean", "median", "min", "max", "sum" ' - 'and "std". The default is "mean". (Default: %(default)s)') - - optional.add_argument('--missingDataAsZero', - help='If set, missing data (NAs) will be treated as zeros. ' - 'The default is to ignore such cases, which will be depicted as black areas in ' - 'a heatmap. (see the --missingDataColor argument ' - 'of the plotHeatmap command for additional options).', - action='store_true') - - optional.add_argument('--skipZeros', - help='Whether regions with only scores of zero ' - 'should be included or not. 
Default is to include ' - 'them.', - action='store_true') - - optional.add_argument('--minThreshold', - default=None, - type=float, - help='Numeric value. Any region containing a ' - 'value that is less than or equal to this ' - 'will be skipped. This is useful to skip, ' - 'for example, genes where the read count is zero ' - 'for any of the bins. This could be the result of ' - 'unmappable areas and can bias the overall results. (Default: %(default)s)') - - optional.add_argument('--maxThreshold', - default=None, - type=float, - help='Numeric value. Any region containing a value ' - 'greater than or equal to this ' - 'will be skipped. The maxThreshold is useful to ' - 'skip those few regions with very high read counts ' - '(e.g. micro satellites) that may bias the average ' - 'values. (Default: %(default)s)') - - optional.add_argument('--blackListFileName', '-bl', - help="A BED file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered.", - metavar="BED file", - required=False) - - optional.add_argument('--samplesLabel', - help='Labels for the samples. This will then be passed to plotHeatmap and plotProfile. The ' - 'default is to use the file name of the ' - 'sample. The sample labels should be separated ' - 'by spaces and quoted if a label itself' - 'contains a space E.g. --samplesLabel label-1 "label 2" ', - nargs='+') - - optional.add_argument('--smartLabels', - action='store_true', - help='Instead of manually specifying labels for the input ' - 'bigWig and BED/GTF files, this causes deepTools to use the file name ' - 'after removing the path and extension.') + optional.add_argument( + "--regionBodyLength", help=argparse.SUPPRESS, default=0, type=int + ) + optional.add_argument( + "--unscaled5prime", default=0, type=int, help=argparse.SUPPRESS + ) + optional.add_argument( + "--unscaled3prime", default=0, type=int, help=argparse.SUPPRESS + ) + optional.add_argument( + "--beforeRegionStartLength", + "-b", + "--upstream", + default=500, + type=int, + metavar="INT bp", + help="Distance upstream of the reference-point " + "selected. (Default: %(default)s)", + ) + optional.add_argument( + "--afterRegionStartLength", + "-a", + "--downstream", + default=1500, + metavar="INT bp", + type=int, + help="Distance downstream of the " + "reference-point selected. (Default: %(default)s)", + ) + optional.add_argument( + "--nanAfterEnd", + action="store_true", + help="If set, any values after the region end " + "are discarded. This is useful to visualize " + "the region end when not using the " + "scale-regions mode and when the reference-" + "point is set to the TSS.", + ) + + optional.add_argument( + "--binSize", + "-bs", + help="Length, in bases, of the non-overlapping " + "bins for averaging the score over the " + "regions length. (Default: %(default)s)", + type=int, + default=10, + ) + + optional.add_argument( + "--sortRegions", + help="Whether the output file should present the " + "regions sorted. The default is to not sort the regions. " + "Note that this is only useful if you plan to plot " + "the results yourself and not, for example, with " + "plotHeatmap, which will override this. Note also that " + "unsorted output will be in whatever order the regions " + "happen to be processed in and not match the order in " + "the input files. 
If you require the output order to " + "match that of the input regions, then either specify " + '"keep" or use computeMatrixOperations to resort the ' + "results file. (Default: %(default)s)", + choices=["descend", "ascend", "no", "keep"], + default="keep", + ) + + optional.add_argument( + "--sortUsing", + help="Indicate which method should be used for " + "sorting. The value is computed for each row. " + "Note that the region_length option will lead " + "to a dotted line within the heatmap that indicates " + "the end of the regions. (Default: %(default)s)", + choices=["mean", "median", "max", "min", "sum", "region_length"], + default="mean", + ) + + optional.add_argument( + "--sortUsingSamples", + help="List of sample numbers (order as in matrix), " + "that are used for sorting by --sortUsing, " + "no value uses all samples, " + "example: --sortUsingSamples 1 3", + type=int, + nargs="+", + ) + + optional.add_argument( + "--averageTypeBins", + default="mean", + choices=["mean", "median", "min", "max", "std", "sum"], + help="Define the type of statistic that should be " + "used over the bin size range. The " + 'options are: "mean", "median", "min", "max", "sum" ' + 'and "std". The default is "mean". (Default: %(default)s)', + ) + + optional.add_argument( + "--missingDataAsZero", + help="If set, missing data (NAs) will be treated as zeros. " + "The default is to ignore such cases, which will be depicted as black areas in " + "a heatmap. (see the --missingDataColor argument " + "of the plotHeatmap command for additional options).", + action="store_true", + ) + + optional.add_argument( + "--skipZeros", + help="Whether regions with only scores of zero " + "should be included or not. Default is to include " + "them.", + action="store_true", + ) + + optional.add_argument( + "--minThreshold", + default=None, + type=float, + help="Numeric value. Any region containing a " + "value that is less than or equal to this " + "will be skipped. This is useful to skip, " + "for example, genes where the read count is zero " + "for any of the bins. This could be the result of " + "unmappable areas and can bias the overall results. (Default: %(default)s)", + ) + + optional.add_argument( + "--maxThreshold", + default=None, + type=float, + help="Numeric value. Any region containing a value " + "greater than or equal to this " + "will be skipped. The maxThreshold is useful to " + "skip those few regions with very high read counts " + "(e.g. micro satellites) that may bias the average " + "values. (Default: %(default)s)", + ) + + optional.add_argument( + "--blackListFileName", + "-bl", + help="A BED file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered.", + metavar="BED file", + required=False, + ) + + optional.add_argument( + "--samplesLabel", + help="Labels for the samples. This will then be passed to plotHeatmap and plotProfile. The " + "default is to use the file name of the " + "sample. The sample labels should be separated " + "by spaces and quoted if a label itself " + 'contains a space E.g. 
--samplesLabel label-1 "label 2" ', + nargs="+", + ) + + optional.add_argument( + "--smartLabels", + action="store_true", + help="Instead of manually specifying labels for the input " + "bigWig and BED/GTF files, this causes deepTools to use the file name " + "after removing the path and extension.", + ) # in contrast to other tools, # computeMatrix by default outputs # messages and the --quiet flag supresses them - optional.add_argument('--quiet', '-q', - help='Set to remove any warning or processing ' - 'messages.', - action='store_true') - - optional.add_argument('--verbose', - help='Being VERY verbose in the status messages. --quiet will disable this.', - action='store_true') - - optional.add_argument('--scale', - help='If set, all values are multiplied by ' - 'this number. (Default: %(default)s)', - type=float, - default=1) - optional.add_argument('--numberOfProcessors', '-p', - help='Number of processors to use. Type "max/2" to ' - 'use half the maximum number of processors or "max" ' - 'to use all available processors. (Default: %(default)s)', - metavar="INT", - type=numberOfProcessors, - default=1, - required=False) + optional.add_argument( + "--quiet", + "-q", + help="Set to remove any warning or processing " "messages.", + action="store_true", + ) + + optional.add_argument( + "--verbose", + help="Being VERY verbose in the status messages. --quiet will disable this.", + action="store_true", + ) + + optional.add_argument( + "--scale", + help="If set, all values are multiplied by " + "this number. (Default: %(default)s)", + type=float, + default=1, + ) + optional.add_argument( + "--numberOfProcessors", + "-p", + help='Number of processors to use. Type "max/2" to ' + 'use half the maximum number of processors or "max" ' + "to use all available processors. (Default: %(default)s)", + metavar="INT", + type=numberOfProcessors, + default=1, + required=False, + ) return parser @@ -356,42 +444,43 @@ def process_args(args=None): if args.quiet is True: args.verbose = False - if args.command == 'scale-regions': + if args.command == "scale-regions": args.nanAfterEnd = False args.referencePoint = None - elif args.command == 'reference-point': - if args.beforeRegionStartLength == 0 and \ - args.afterRegionStartLength == 0: - sys.exit("\nUpstrean and downstream regions are both " - "set to 0. Nothing to output. Maybe you want to " - "use the scale-regions mode?\n") + elif args.command == "reference-point": + if args.beforeRegionStartLength == 0 and args.afterRegionStartLength == 0: + sys.exit( + "\nUpstream and downstream regions are both " + "set to 0. Nothing to output. 
Maybe you want to " + "use the scale-regions mode?\n" + ) - return(args) + return args def main(args=None): - args = process_args(args) - parameters = {'upstream': args.beforeRegionStartLength, - 'downstream': args.afterRegionStartLength, - 'body': args.regionBodyLength, - 'bin size': args.binSize, - 'ref point': args.referencePoint, - 'verbose': args.verbose, - 'bin avg type': args.averageTypeBins, - 'missing data as zero': args.missingDataAsZero, - 'min threshold': args.minThreshold, - 'max threshold': args.maxThreshold, - 'scale': args.scale, - 'skip zeros': args.skipZeros, - 'nan after end': args.nanAfterEnd, - 'proc number': args.numberOfProcessors, - 'sort regions': args.sortRegions, - 'sort using': args.sortUsing, - 'unscaled 5 prime': args.unscaled5prime, - 'unscaled 3 prime': args.unscaled3prime - } + parameters = { + "upstream": args.beforeRegionStartLength, + "downstream": args.afterRegionStartLength, + "body": args.regionBodyLength, + "bin size": args.binSize, + "ref point": args.referencePoint, + "verbose": args.verbose, + "bin avg type": args.averageTypeBins, + "missing data as zero": args.missingDataAsZero, + "min threshold": args.minThreshold, + "max threshold": args.maxThreshold, + "scale": args.scale, + "skip zeros": args.skipZeros, + "nan after end": args.nanAfterEnd, + "proc number": args.numberOfProcessors, + "sort regions": args.sortRegions, + "sort using": args.sortUsing, + "unscaled 5 prime": args.unscaled5prime, + "unscaled 3 prime": args.unscaled3prime, + } hm = heatmapper.heatmapper() @@ -401,7 +490,11 @@ def main(args=None): if db.isDeepBlue(fname): deepBlueFiles.append([fname, idx]) if len(deepBlueFiles) > 0: - sys.stderr.write("Preloading the following deepBlue files: {}\n".format(",".join([x[0] for x in deepBlueFiles]))) + sys.stderr.write( + "Preloading the following deepBlue files: {}\n".format( + ",".join([x[0] for x in deepBlueFiles]) + ) + ) regs = db.makeRegions(args.regionsFileName, args) for x in deepBlueFiles: x.extend([args, regs]) @@ -412,28 +505,49 @@ def main(args=None): res = list(map(db.preloadWrapper, deepBlueFiles)) # substitute the file names with the temp files - for (ftuple, r) in zip(deepBlueFiles, res): + for ftuple, r in zip(deepBlueFiles, res): args.scoreFileName[ftuple[1]] = r deepBlueFiles = [[x[0], x[1]] for x in deepBlueFiles] del regs scores_file_list = args.scoreFileName - hm.computeMatrix(scores_file_list, args.regionsFileName, parameters, blackListFileName=args.blackListFileName, verbose=args.verbose, allArgs=args) - if args.sortRegions not in ['no', 'keep']: + hm.computeMatrix( + scores_file_list, + args.regionsFileName, + parameters, + blackListFileName=args.blackListFileName, + verbose=args.verbose, + allArgs=args, + ) + if args.sortRegions not in ["no", "keep"]: sortUsingSamples = [] if args.sortUsingSamples is not None: for i in args.sortUsingSamples: - if (i > 0 and i <= hm.matrix.get_num_samples()): + if i > 0 and i <= hm.matrix.get_num_samples(): sortUsingSamples.append(i - 1) else: - exit("The value {0} for --sortUsingSamples is not valid. Only values from 1 to {1} are allowed.".format(args.sortUsingSamples, hm.matrix.get_num_samples())) - print('Samples used for ordering within each group: ', sortUsingSamples) - - hm.matrix.sort_groups(sort_using=args.sortUsing, sort_method=args.sortRegions, sample_list=sortUsingSamples) - elif args.sortRegions == 'keep': - hm.parameters['group_labels'] = hm.matrix.group_labels + exit( + "The value {0} for --sortUsingSamples is not valid. 
Only values from 1 to {1} are allowed.".format( + args.sortUsingSamples, hm.matrix.get_num_samples() + ) + ) + print("Samples used for ordering within each group: ", sortUsingSamples) + + hm.matrix.sort_groups( + sort_using=args.sortUsing, + sort_method=args.sortRegions, + sample_list=sortUsingSamples, + ) + elif args.sortRegions == "keep": + hm.parameters["group_labels"] = hm.matrix.group_labels hm.parameters["group_boundaries"] = hm.matrix.group_boundaries - cmo.sortMatrix(hm, args.regionsFileName, args.transcriptID, args.transcript_id_designator, verbose=not args.quiet) + cmo.sortMatrix( + hm, + args.regionsFileName, + args.transcriptID, + args.transcript_id_designator, + verbose=not args.quiet, + ) hm.save_matrix(args.outFileName) diff --git a/deeptools/computeMatrixOperations.py b/deeptools/computeMatrixOperations.py index deb62076c..56138821b 100755 --- a/deeptools/computeMatrixOperations.py +++ b/deeptools/computeMatrixOperations.py @@ -50,249 +50,272 @@ def parse_arguments(): computeMatrixOperations dataRange -h """, - epilog='example usages:\n' - 'computeMatrixOperations subset -m input.mat.gz -o output.mat.gz --group "group 1" "group 2" --samples "sample 3" "sample 10"\n\n' - ' \n\n') + epilog="example usages:\n" + 'computeMatrixOperations subset -m input.mat.gz -o output.mat.gz --group "group 1" "group 2" --samples "sample 3" "sample 10"\n\n' + " \n\n", + ) - subparsers = parser.add_subparsers( - title='Commands', - dest='command', - metavar='') + subparsers = parser.add_subparsers(title="Commands", dest="command", metavar="") # info subparsers.add_parser( - 'info', + "info", formatter_class=argparse.ArgumentDefaultsHelpFormatter, parents=[infoArgs()], help="Print group and sample information", - usage='An example usage is:\n computeMatrixOperations info -m input.mat.gz\n\n') + usage="An example usage is:\n computeMatrixOperations info -m input.mat.gz\n\n", + ) # relabel subparsers.add_parser( - 'relabel', + "relabel", formatter_class=argparse.ArgumentDefaultsHelpFormatter, parents=[infoArgs(), relabelArgs()], help="Change sample and/or group label information", - usage='An example usage is:\n computeMatrixOperations relabel -m input.mat.gz -o output.mat.gz --sampleLabels "sample 1" "sample 2"\n\n') + usage='An example usage is:\n computeMatrixOperations relabel -m input.mat.gz -o output.mat.gz --sampleLabels "sample 1" "sample 2"\n\n', + ) # subset subparsers.add_parser( - 'subset', + "subset", formatter_class=argparse.ArgumentDefaultsHelpFormatter, parents=[infoArgs(), subsetArgs()], help="Actually subset the matrix. 
The group and sample orders are honored, so one can also reorder files.", - usage='An example usage is:\n computeMatrixOperations subset -m ' + usage="An example usage is:\n computeMatrixOperations subset -m " 'input.mat.gz -o output.mat.gz --groups "group 1" "group 2" ' - '--samples "sample 3" "sample 10"\n\n') + '--samples "sample 3" "sample 10"\n\n', + ) # filterStrand subparsers.add_parser( - 'filterStrand', + "filterStrand", formatter_class=argparse.ArgumentDefaultsHelpFormatter, parents=[infoArgs(), filterStrandArgs()], help="Filter entries by strand.", - usage='Example usage:\n computeMatrixOperations filterStrand -m ' - 'input.mat.gz -o output.mat.gz --strand +\n\n') + usage="Example usage:\n computeMatrixOperations filterStrand -m " + "input.mat.gz -o output.mat.gz --strand +\n\n", + ) # filterValues subparsers.add_parser( - 'filterValues', + "filterValues", formatter_class=argparse.ArgumentDefaultsHelpFormatter, parents=[infoArgs(), filterValuesArgs()], help="Filter entries by min/max value.", - usage='Example usage:\n computeMatrixOperations filterValues -m ' - 'input.mat.gz -o output.mat.gz --min 10 --max 1000\n\n') + usage="Example usage:\n computeMatrixOperations filterValues -m " + "input.mat.gz -o output.mat.gz --min 10 --max 1000\n\n", + ) # rbind subparsers.add_parser( - 'rbind', + "rbind", formatter_class=argparse.ArgumentDefaultsHelpFormatter, parents=[bindArgs()], help="merge multiple matrices by concatenating them head to tail. This assumes that the same samples are present in each in the same order.", - usage='Example usage:\n computeMatrixOperations rbind -m ' - 'input1.mat.gz input2.mat.gz -o output.mat.gz\n\n') + usage="Example usage:\n computeMatrixOperations rbind -m " + "input1.mat.gz input2.mat.gz -o output.mat.gz\n\n", + ) # cbind subparsers.add_parser( - 'cbind', + "cbind", formatter_class=argparse.ArgumentDefaultsHelpFormatter, parents=[bindArgs()], help="merge multiple matrices by concatenating them left to right. No assumptions are made about the row order. Regions not present in the first file specified are ignored. Regions missing in subsequent files will result in NAs. Regions are matches based on the first 6 columns of the computeMatrix output (essentially the columns in a BED file).", - usage='Example usage:\n computeMatrixOperations cbind -m ' - 'input1.mat.gz input2.mat.gz -o output.mat.gz\n\n') + usage="Example usage:\n computeMatrixOperations cbind -m " + "input1.mat.gz input2.mat.gz -o output.mat.gz\n\n", + ) # sort subparsers.add_parser( - 'sort', + "sort", formatter_class=argparse.ArgumentDefaultsHelpFormatter, parents=[sortArgs()], - help='Sort a matrix file to correspond to the order of entries in the desired input file(s). The groups of regions designated by the files must be present in the order found in the output of computeMatrix (otherwise, use the subset command first). Note that this subcommand can also be used to remove unwanted regions, since regions not present in the input file(s) will be omitted from the output.', - usage='Example usage:\n computeMatrixOperations sort -m input.mat.gz -R regions1.bed regions2.bed regions3.gtf -o input.sorted.mat.gz\n\n') + help="Sort a matrix file to correspond to the order of entries in the desired input file(s). The groups of regions designated by the files must be present in the order found in the output of computeMatrix (otherwise, use the subset command first). 
Note that this subcommand can also be used to remove unwanted regions, since regions not present in the input file(s) will be omitted from the output.", + usage="Example usage:\n computeMatrixOperations sort -m input.mat.gz -R regions1.bed regions2.bed regions3.gtf -o input.sorted.mat.gz\n\n", + ) # dataRange subparsers.add_parser( - 'dataRange', + "dataRange", formatter_class=argparse.ArgumentDefaultsHelpFormatter, parents=[infoArgs()], - help='Returns the min, max, median, 10th and 90th percentile of the matrix values per sample.', - usage='Example usage:\n computeMatrixOperations dataRange -m input.mat.gz\n\n') + help="Returns the min, max, median, 10th and 90th percentile of the matrix values per sample.", + usage="Example usage:\n computeMatrixOperations dataRange -m input.mat.gz\n\n", + ) - parser.add_argument('--version', action='version', - version='%(prog)s {}'.format(__version__)) + parser.add_argument( + "--version", action="version", version="%(prog)s {}".format(__version__) + ) return parser def bindArgs(): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') + required = parser.add_argument_group("Required arguments") - required.add_argument('--matrixFile', '-m', - help='Matrix files from the computeMatrix tool.', - nargs='+', - required=True) + required.add_argument( + "--matrixFile", + "-m", + help="Matrix files from the computeMatrix tool.", + nargs="+", + required=True, + ) - required.add_argument('--outFileName', '-o', - help='Output file name', - required=True) + required.add_argument("--outFileName", "-o", help="Output file name", required=True) return parser def infoArgs(): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') + required = parser.add_argument_group("Required arguments") - required.add_argument('--matrixFile', '-m', - help='Matrix file from the computeMatrix tool.', - required=True) + required.add_argument( + "--matrixFile", + "-m", + help="Matrix file from the computeMatrix tool.", + required=True, + ) return parser def relabelArgs(): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') + required = parser.add_argument_group("Required arguments") - required.add_argument('--outFileName', '-o', - help='Output file name', - required=True) + required.add_argument("--outFileName", "-o", help="Output file name", required=True) - optional = parser.add_argument_group('Optional arguments') + optional = parser.add_argument_group("Optional arguments") - optional.add_argument('--groupLabels', - nargs='+', - help="Groups labels. If none are specified then the current labels will be kept.") + optional.add_argument( + "--groupLabels", + nargs="+", + help="Group labels. If none are specified then the current labels will be kept.", + ) - optional.add_argument('--sampleLabels', - nargs='+', - help="Sample labels. 
If none are specified then the current labels will be kept.", + ) return parser def subsetArgs(): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') + required = parser.add_argument_group("Required arguments") - required.add_argument('--outFileName', '-o', - help='Output file name', - required=True) + required.add_argument("--outFileName", "-o", help="Output file name", required=True) - optional = parser.add_argument_group('Optional arguments') + optional = parser.add_argument_group("Optional arguments") - optional.add_argument('--groups', - nargs='+', - help="Groups to include. If none are specified then all will be included.") + optional.add_argument( + "--groups", + nargs="+", + help="Groups to include. If none are specified then all will be included.", + ) - optional.add_argument('--samples', - nargs='+', - help="Samples to include. If none are specified then all will be included.") + optional.add_argument( + "--samples", + nargs="+", + help="Samples to include. If none are specified then all will be included.", + ) return parser def filterStrandArgs(): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') + required = parser.add_argument_group("Required arguments") - required.add_argument('--outFileName', '-o', - help='Output file name', - required=True) + required.add_argument("--outFileName", "-o", help="Output file name", required=True) - required.add_argument('--strand', '-s', - help='Strand', - choices=['+', '-', '.'], - required=True) + required.add_argument( + "--strand", "-s", help="Strand", choices=["+", "-", "."], required=True + ) return parser def filterValuesArgs(): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') + required = parser.add_argument_group("Required arguments") - required.add_argument('--outFileName', '-o', - help='Output file name', - required=True) + required.add_argument("--outFileName", "-o", help="Output file name", required=True) - optional = parser.add_argument_group('Optional arguments') - optional.add_argument('--min', - help='Minimum value. Any row having a single entry less than this will be excluded. The default is no minimum.', - type=float, - default=None) + optional = parser.add_argument_group("Optional arguments") + optional.add_argument( + "--min", + help="Minimum value. Any row having a single entry less than this will be excluded. The default is no minimum.", + type=float, + default=None, + ) - optional.add_argument('--max', - help='Maximum value. Any row having a single entry more than this will be excluded. The default is no maximum.', - type=float, - default=None) + optional.add_argument( + "--max", + help="Maximum value. Any row having a single entry more than this will be excluded. The default is no maximum.", + type=float, + default=None, + ) return parser def sortArgs(): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') - - required.add_argument('--matrixFile', '-m', - help='Matrix file from the computeMatrix tool.', - required=True) - - required.add_argument('--outFileName', '-o', - help='Output file name', - required=True) - - required.add_argument('--regionsFileName', '-R', - help='File name(s), in BED or GTF format, containing the regions. ' - 'If multiple bed files are given, each one is ' - 'considered a group that can be plotted separately. 
' - 'Also, adding a "#" symbol in the bed file causes all ' - 'the regions until the previous "#" to be considered ' - 'one group. Alternatively for BED files, putting ' - 'deepTools_group in the header can be used to indicate a ' - 'column with group labels. Note that these should be ' - 'sorted such that all group entries are together.', - required=True, - nargs='+') - - optional = parser.add_argument_group('Optional arguments') - - optional.add_argument('--transcriptID', - default='transcript', - help='When a GTF file is used to provide regions, only ' - 'entries with this value as their feature (column 3) ' - 'will be processed as transcripts. (Default: %(default)s)') - - optional.add_argument('--transcript_id_designator', - default='transcript_id', - help='Each region has an ID (e.g., ACTB) assigned to it, ' - 'which for BED files is either column 4 (if it exists) ' - 'or the interval bounds. For GTF files this is instead ' - 'stored in the last column as a key:value pair (e.g., as ' - '\'transcript_id "ACTB"\', for a key of transcript_id ' - 'and a value of ACTB). In some cases it can be ' - 'convenient to use a different identifier. To do so, set ' - 'this to the desired key. (Default: %(default)s)') + required = parser.add_argument_group("Required arguments") + + required.add_argument( + "--matrixFile", + "-m", + help="Matrix file from the computeMatrix tool.", + required=True, + ) + + required.add_argument("--outFileName", "-o", help="Output file name", required=True) + + required.add_argument( + "--regionsFileName", + "-R", + help="File name(s), in BED or GTF format, containing the regions. " + "If multiple bed files are given, each one is " + "considered a group that can be plotted separately. " + 'Also, adding a "#" symbol in the bed file causes all ' + 'the regions until the previous "#" to be considered ' + "one group. Alternatively for BED files, putting " + "deepTools_group in the header can be used to indicate a " + "column with group labels. Note that these should be " + "sorted such that all group entries are together.", + required=True, + nargs="+", + ) + + optional = parser.add_argument_group("Optional arguments") + + optional.add_argument( + "--transcriptID", + default="transcript", + help="When a GTF file is used to provide regions, only " + "entries with this value as their feature (column 3) " + "will be processed as transcripts. (Default: %(default)s)", + ) + + optional.add_argument( + "--transcript_id_designator", + default="transcript_id", + help="Each region has an ID (e.g., ACTB) assigned to it, " + "which for BED files is either column 4 (if it exists) " + "or the interval bounds. For GTF files this is instead " + "stored in the last column as a key:value pair (e.g., as " + "'transcript_id \"ACTB\"', for a key of transcript_id " + "and a value of ACTB). In some cases it can be " + "convenient to use a different identifier. To do so, set " + "this to the desired key. 
(Default: %(default)s)", + ) return parser @@ -320,11 +343,16 @@ def printDataRange(matrix): start = matrix.matrix.sample_boundaries[i] end = matrix.matrix.sample_boundaries[i + 1] sample_matrix = matrix.matrix.matrix[..., start:end] - print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(sample, np.amin(sample_matrix), - np.amax(sample_matrix), - np.ma.median(sample_matrix), - np.percentile(sample_matrix, 10), - np.percentile(sample_matrix, 90))) + print( + "{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format( + sample, + np.amin(sample_matrix), + np.amax(sample_matrix), + np.ma.median(sample_matrix), + np.percentile(sample_matrix, 10), + np.percentile(sample_matrix, 90), + ) + ) def relabelMatrix(matrix, args): @@ -333,11 +361,19 @@ def relabelMatrix(matrix, args): """ if args.groupLabels: if len(args.groupLabels) != len(matrix.matrix.group_labels): - sys.exit("You specified {} group labels, but {} are required.\n".format(len(args.groupLabels), len(matrix.matrix.group_labels))) + sys.exit( + "You specified {} group labels, but {} are required.\n".format( + len(args.groupLabels), len(matrix.matrix.group_labels) + ) + ) matrix.matrix.group_labels = args.groupLabels if args.sampleLabels: if len(args.sampleLabels) != len(matrix.matrix.sample_labels): - sys.exit("You specified {} sample labels, but {} are required.\n".format(len(args.sampleLabels), len(matrix.matrix.sample_labels))) + sys.exit( + "You specified {} sample labels, but {} are required.\n".format( + len(args.sampleLabels), len(matrix.matrix.sample_labels) + ) + ) matrix.matrix.sample_labels = args.sampleLabels @@ -345,7 +381,7 @@ def getGroupBounds(args, matrix): """ Given the group labels, return an indexing array and the resulting boundaries """ - bounds = matrix.parameters['group_boundaries'] + bounds = matrix.parameters["group_boundaries"] if args.groups is None: return range(0, matrix.matrix.matrix.shape[0]), np.array(bounds) else: @@ -364,7 +400,7 @@ def getSampleBounds(args, matrix): """ Given the sample labels, return an indexing array """ - bounds = matrix.parameters['sample_boundaries'] + bounds = matrix.parameters["sample_boundaries"] if args.samples is None: return np.arange(0, matrix.matrix.matrix.shape[1]) else: @@ -389,7 +425,9 @@ def subsetRegions(hm, bounds): ends = reg["end"].split(",") ends = [int(x) for x in ends] regs = [(x, y) for x, y in zip(starts, ends)] - out.append([reg["chrom"], regs, reg["name"], 0, reg["strand"], reg["score"]]) + out.append( + [reg["chrom"], regs, reg["name"], 0, reg["strand"], reg["score"]] + ) else: out.append(reg) return out @@ -409,7 +447,15 @@ def filterHeatmap(hm, args): # Get the new bounds for idx in range(1, len(hm.matrix.group_boundaries)): - i = int(np.sum(keep[hm.matrix.group_boundaries[idx - 1]:hm.matrix.group_boundaries[idx]])) + i = int( + np.sum( + keep[ + hm.matrix.group_boundaries[idx - 1] : hm.matrix.group_boundaries[ + idx + ] + ] + ) + ) bounds.append(bounds[idx - 1] + i) hm.matrix.group_boundaries = bounds @@ -427,8 +473,10 @@ def filterHeatmapValues(hm, minVal, maxVal): minVal = -np.inf if maxVal is None: maxVal = np.inf - np.warnings.filterwarnings('ignore') - for i, (x, y) in enumerate(zip(np.nanmin(hm.matrix.matrix, axis=1), np.nanmax(hm.matrix.matrix, axis=1))): + np.warnings.filterwarnings("ignore") + for i, (x, y) in enumerate( + zip(np.nanmin(hm.matrix.matrix, axis=1), np.nanmax(hm.matrix.matrix, axis=1)) + ): # x/y will be nan iff a row is entirely nan. Don't filter. 
if np.isnan(x) or (x >= minVal and y <= maxVal): keep.append(True) @@ -439,7 +487,15 @@ def filterHeatmapValues(hm, minVal, maxVal): # Get the new bounds for idx in range(1, len(hm.matrix.group_boundaries)): - i = int(np.sum(keep[hm.matrix.group_boundaries[idx - 1]:hm.matrix.group_boundaries[idx]])) + i = int( + np.sum( + keep[ + hm.matrix.group_boundaries[idx - 1] : hm.matrix.group_boundaries[ + idx + ] + ] + ) + ) bounds.append(bounds[idx - 1] + i) hm.matrix.group_boundaries = bounds @@ -463,7 +519,9 @@ def insertMatrix(hm, hm2, groupName): hm2End = hm2.parameters["group_boundaries"][idx2 + 1] # Insert the subset hm2 into hm along axis 0 - hm.matrix.matrix = np.insert(hm.matrix.matrix, hmEnd, hm2.matrix.matrix[hm2Start:hm2End, :], axis=0) + hm.matrix.matrix = np.insert( + hm.matrix.matrix, hmEnd, hm2.matrix.matrix[hm2Start:hm2End, :], axis=0 + ) # Insert the regions hm.matrix.regions[hmEnd:hmEnd] = hm2.matrix.regions[hm2Start:hm2End] @@ -488,9 +546,13 @@ def appendMatrix(hm, hm2, groupName): hm2End = hm2.parameters["group_boundaries"][idx2 + 1] # Append the matrix - hm.matrix.matrix = np.concatenate([hm.matrix.matrix, hm2.matrix.matrix[hm2Start:hm2End, :]], axis=0) + hm.matrix.matrix = np.concatenate( + [hm.matrix.matrix, hm2.matrix.matrix[hm2Start:hm2End, :]], axis=0 + ) # Update the bounds - hm.parameters["group_boundaries"].append(hm.parameters["group_boundaries"][-1] + hm2End - hm2Start) + hm.parameters["group_boundaries"].append( + hm.parameters["group_boundaries"][-1] + hm2End - hm2Start + ) # Append the regions hm.matrix.regions.extend(hm2.matrix.regions[hm2Start:hm2End]) @@ -513,8 +575,8 @@ def rbindMatrices(hm, args): hm.parameters["group_labels"].append(group) # Update the group boundaries attribute - hm.matrix.group_labels = hm.parameters['group_labels'] - hm.matrix.group_boundaries = hm.parameters['group_boundaries'] + hm.matrix.group_labels = hm.parameters["group_labels"] + hm.matrix.group_boundaries = hm.parameters["group_boundaries"] def cbindMatrices(hm, args): @@ -538,14 +600,19 @@ def cbindMatrices(hm, args): for idx in range(1, len(args.matrixFile)): hm2.read_matrix_file(args.matrixFile[idx]) # Add the sample labels - hm.parameters['sample_labels'].extend(hm2.parameters['sample_labels']) + hm.parameters["sample_labels"].extend(hm2.parameters["sample_labels"]) # Add the sample boundaries - lens = [x + hm.parameters['sample_boundaries'][-1] for x in hm2.parameters['sample_boundaries']][1:] - hm.parameters['sample_boundaries'].extend(lens) + lens = [ + x + hm.parameters["sample_boundaries"][-1] + for x in hm2.parameters["sample_boundaries"] + ][1:] + hm.parameters["sample_boundaries"].extend(lens) # Add on additional NA initialized columns ncol = hm.matrix.matrix.shape[1] - hm.matrix.matrix = np.hstack((hm.matrix.matrix, np.empty(hm2.matrix.matrix.shape))) + hm.matrix.matrix = np.hstack( + (hm.matrix.matrix, np.empty(hm2.matrix.matrix.shape)) + ) hm.matrix.matrix[:, ncol:] = np.NAN # Update the values @@ -557,15 +624,17 @@ def cbindMatrices(hm, args): for idx3, reg in enumerate(hm2.matrix.regions[s:e]): if reg[2] not in d[group]: continue - hm.matrix.matrix[d[group][reg[2]], ncol:] = hm2.matrix.matrix[s + idx3, :] + hm.matrix.matrix[d[group][reg[2]], ncol:] = hm2.matrix.matrix[ + s + idx3, : + ] # Append the special params for s in hm.special_params: hm.parameters[s].extend(hm2.parameters[s]) # Update the sample parameters - hm.matrix.sample_labels = hm.parameters['sample_labels'] - hm.matrix.sample_boundaries = hm.parameters['sample_boundaries'] + hm.matrix.sample_labels 
= hm.parameters["sample_labels"] + hm.matrix.sample_boundaries = hm.parameters["sample_boundaries"] def loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup): @@ -601,7 +670,9 @@ def loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup): if len(label): labels[dti.findRandomLabel(labels, label)] = len(labels) else: - labels[dti.findRandomLabel(labels, os.path.basename(fname))] = len(labels) + labels[dti.findRandomLabel(labels, os.path.basename(fname))] = len( + labels + ) regions.append(localRegions) localRegions = dict() continue @@ -637,7 +708,7 @@ def loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup): def loadGTFtranscript(cols, label, defaultGroup, transcript_id_designator): - s = next(csv.reader([cols[8]], delimiter=' ')) + s = next(csv.reader([cols[8]], delimiter=" ")) if "deepTools_group" in s and s[-1] != "deepTools_group": label = s[s.index("deepTools_group") + 1].rstrip(";") elif defaultGroup is not None: @@ -651,7 +722,16 @@ def loadGTFtranscript(cols, label, defaultGroup, transcript_id_designator): return label, name -def loadGTF(line, fp, fname, labels, regions, transcriptID, transcript_id_designator, defaultGroup): +def loadGTF( + line, + fp, + fname, + labels, + regions, + transcriptID, + transcript_id_designator, + defaultGroup, +): """ Like loadBED, but for a GTF file @@ -662,7 +742,9 @@ def loadGTF(line, fp, fname, labels, regions, transcriptID, transcript_id_design # handle the first line cols = line.split("\t") if cols[2].lower() == transcriptID.lower(): - label, name = loadGTFtranscript(cols, file_label, defaultGroup, transcript_id_designator) + label, name = loadGTFtranscript( + cols, file_label, defaultGroup, transcript_id_designator + ) if label is not None: if label not in labels: labels[label] = len(labels) @@ -672,13 +754,15 @@ def loadGTF(line, fp, fname, labels, regions, transcriptID, transcript_id_design for line in fp: if not isinstance(line, str): - line = line.decode('ascii') - if not line.startswith('#'): - cols = line.strip().split('\t') + line = line.decode("ascii") + if not line.startswith("#"): + cols = line.strip().split("\t") if len(cols) == 0: continue if cols[2].lower() == transcriptID: - label, name = loadGTFtranscript(cols, file_label, defaultGroup, transcript_id_designator) + label, name = loadGTFtranscript( + cols, file_label, defaultGroup, transcript_id_designator + ) if label is None: continue if label not in labels: @@ -688,7 +772,9 @@ def loadGTF(line, fp, fname, labels, regions, transcriptID, transcript_id_design regions[labelIdx][name] = len(regions[labelIdx]) -def sortMatrix(hm, regionsFileName, transcriptID, transcript_id_designator, verbose=True): +def sortMatrix( + hm, regionsFileName, transcriptID, transcript_id_designator, verbose=True +): """ Iterate through the files noted by regionsFileName and sort hm accordingly """ @@ -717,29 +803,44 @@ def sortMatrix(hm, regionsFileName, transcriptID, transcript_id_designator, verb # Determine the file type and load into a list (or list of lists) cols = line.strip().split("\t") if len(cols) - subtract < 3: - raise RuntimeError('{0} does not seem to be a recognized file type!'.format(fname)) + raise RuntimeError( + "{0} does not seem to be a recognized file type!".format(fname) + ) elif len(cols) - subtract <= 6: loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup) elif len(cols) and dti.seemsLikeGTF(cols): - loadGTF(line, fp, fname, labels, regions, transcriptID, transcript_id_designator, defaultGroup) + loadGTF( + line, + fp, + 
fname, + labels, + regions, + transcriptID, + transcript_id_designator, + defaultGroup, + ) else: loadBED(line, fp, fname, labelColumn, labels, regions, defaultGroup) fp.close() # Do some sanity checking on the group labels and region names within them - s1 = set(hm.parameters['group_labels']) + s1 = set(hm.parameters["group_labels"]) if verbose: for e in labels: if e not in s1: - sys.exit("The computeMatrix output is missing the '{}' region group. It has {} but the specified regions have {}.\n".format(e, s1, labels.keys())) + sys.exit( + "The computeMatrix output is missing the '{}' region group. It has {} but the specified regions have {}.\n".format( + e, s1, labels.keys() + ) + ) # Make a dictionary out of current labels and regions d = dict() pos = 0 groupSizes = dict() - for idx, label in enumerate(hm.parameters['group_labels']): - s = hm.parameters['group_boundaries'][idx] - e = hm.parameters['group_boundaries'][idx + 1] + for idx, label in enumerate(hm.parameters["group_labels"]): + s = hm.parameters["group_boundaries"][idx] + e = hm.parameters["group_boundaries"][idx + 1] if label not in labels: continue d[label] = dict() @@ -767,7 +868,11 @@ def sortMatrix(hm, regionsFileName, transcriptID, transcript_id_designator, verb for name in _: if name not in d[label]: if verbose: - sys.stderr.write("Skipping {}, due to being absent in the computeMatrix output.\n".format(name)) + sys.stderr.write( + "Skipping {}, due to being absent in the computeMatrix output.\n".format( + name + ) + ) continue sz += 1 order.append(d[label][name]) @@ -795,11 +900,11 @@ def main(args=None): hm = heatmapper.heatmapper() if not isinstance(args.matrixFile, list): hm.read_matrix_file(args.matrixFile) - if args.command == 'info': + if args.command == "info": printInfo(hm) - elif args.command == 'dataRange': + elif args.command == "dataRange": printDataRange(hm) - elif args.command == 'subset': + elif args.command == "subset": sIdx = getSampleBounds(args, hm) gIdx, gBounds = getGroupBounds(args, hm) @@ -811,7 +916,9 @@ def main(args=None): # boundaries if args.samples is None: args.samples = hm.matrix.sample_labels - hm.matrix.sample_boundaries = hm.matrix.sample_boundaries[0:len(args.samples) + 1] + hm.matrix.sample_boundaries = hm.matrix.sample_boundaries[ + 0 : len(args.samples) + 1 + ] hm.matrix.group_boundaries = gBounds.tolist() # special params keepIdx = set() @@ -819,7 +926,9 @@ def main(args=None): if sample in args.samples: keepIdx.add(_) for param in hm.special_params: - hm.parameters[param] = [v for k, v in enumerate(hm.parameters[param]) if k in keepIdx] + hm.parameters[param] = [ + v for k, v in enumerate(hm.parameters[param]) if k in keepIdx + ] # labels hm.matrix.sample_labels = args.samples if args.groups is None: @@ -827,22 +936,24 @@ def main(args=None): hm.matrix.group_labels = args.groups # save hm.save_matrix(args.outFileName) - elif args.command == 'filterStrand': + elif args.command == "filterStrand": filterHeatmap(hm, args) hm.save_matrix(args.outFileName) - elif args.command == 'filterValues': + elif args.command == "filterValues": filterHeatmapValues(hm, args.min, args.max) hm.save_matrix(args.outFileName) - elif args.command == 'rbind': + elif args.command == "rbind": rbindMatrices(hm, args) hm.save_matrix(args.outFileName) - elif args.command == 'cbind': + elif args.command == "cbind": cbindMatrices(hm, args) hm.save_matrix(args.outFileName) - elif args.command == 'sort': - sortMatrix(hm, args.regionsFileName, args.transcriptID, args.transcript_id_designator) + elif args.command == 
"sort": + sortMatrix( + hm, args.regionsFileName, args.transcriptID, args.transcript_id_designator + ) hm.save_matrix(args.outFileName) - elif args.command == 'relabel': + elif args.command == "relabel": relabelMatrix(hm, args) hm.save_matrix(args.outFileName) else: diff --git a/deeptools/correctGCBias.py b/deeptools/correctGCBias.py index d69322483..3d82ff373 100755 --- a/deeptools/correctGCBias.py +++ b/deeptools/correctGCBias.py @@ -20,7 +20,7 @@ from deeptools import utilities from deeptools.bamHandler import openBam -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") def parse_arguments(args=None): @@ -29,21 +29,22 @@ def parse_arguments(args=None): parser = argparse.ArgumentParser( parents=[requiredArgs, parentParser], formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description='This tool corrects the GC-bias using the' - ' method proposed by [Benjamini & Speed (2012). ' - 'Nucleic Acids Research, 40(10)]. It will remove reads' - ' from regions with too high coverage compared to the' - ' expected values (typically GC-rich regions) and will' - ' add reads to regions where too few reads are seen ' - '(typically AT-rich regions). ' - 'The tool ``computeGCBias`` needs to be run first to generate the ' - 'frequency table needed here.', - usage='An example usage is:\n correctGCBias ' - '-b file.bam --effectiveGenomeSize 2150570000 -g mm9.2bit ' - '--GCbiasFrequenciesFile freq.txt -o gc_corrected.bam ' - '[options]', - conflict_handler='resolve', - add_help=False) + description="This tool corrects the GC-bias using the" + " method proposed by [Benjamini & Speed (2012). " + "Nucleic Acids Research, 40(10)]. It will remove reads" + " from regions with too high coverage compared to the" + " expected values (typically GC-rich regions) and will" + " add reads to regions where too few reads are seen " + "(typically AT-rich regions). " + "The tool ``computeGCBias`` needs to be run first to generate the " + "frequency table needed here.", + usage="An example usage is:\n correctGCBias " + "-b file.bam --effectiveGenomeSize 2150570000 -g mm9.2bit " + "--GCbiasFrequenciesFile freq.txt -o gc_corrected.bam " + "[options]", + conflict_handler="resolve", + add_help=False, + ) return parser @@ -56,59 +57,74 @@ def process_args(args=None): def getRequiredArgs(): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') + required = parser.add_argument_group("Required arguments") # define the arguments - required.add_argument('--bamfile', '-b', - metavar='BAM file', - help='Sorted BAM file to correct.', - required=True) - required.add_argument('--effectiveGenomeSize', - help='The effective genome size is the portion ' - 'of the genome that is mappable. Large fractions of ' - 'the genome are stretches of NNNN that should be ' - 'discarded. Also, if repetitive regions were not ' - 'included in the mapping of reads, the effective ' - 'genome size needs to be adjusted accordingly. ' - 'A table of values is available here: ' - 'http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html .', - default=None, - type=int, - required=True) - - required.add_argument('--genome', '-g', - help='Genome in two bit format. Most genomes can be ' - 'found here: http://hgdownload.cse.ucsc.edu/gbdb/ ' - 'Search for the .2bit ending. 
Otherwise, fasta ' - 'files can be converted to 2bit using faToTwoBit ' - 'available here: ' - 'http://hgdownload.cse.ucsc.edu/admin/exe/', - metavar='two bit file', - required=True) - - required.add_argument('--GCbiasFrequenciesFile', '-freq', - help='Indicate the output file from ' - 'computeGCBias containing ' - 'the observed and expected read frequencies per GC-' - 'content.', - type=argparse.FileType('r'), - metavar='FILE', - required=True) - - output = parser.add_argument_group('Output options') - output.add_argument('--correctedFile', '-o', - help='Name of the corrected file. The ending will ' - 'be used to decide the output file format. The options ' - 'are ".bam", ".bw" for a bigWig file, ".bg" for a ' - 'bedGraph file.', - metavar='FILE', - type=argparse.FileType('w'), - required=True) + required.add_argument( + "--bamfile", + "-b", + metavar="BAM file", + help="Sorted BAM file to correct.", + required=True, + ) + required.add_argument( + "--effectiveGenomeSize", + help="The effective genome size is the portion " + "of the genome that is mappable. Large fractions of " + "the genome are stretches of NNNN that should be " + "discarded. Also, if repetitive regions were not " + "included in the mapping of reads, the effective " + "genome size needs to be adjusted accordingly. " + "A table of values is available here: " + "http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html .", + default=None, + type=int, + required=True, + ) + + required.add_argument( + "--genome", + "-g", + help="Genome in two bit format. Most genomes can be " + "found here: http://hgdownload.cse.ucsc.edu/gbdb/ " + "Search for the .2bit ending. Otherwise, fasta " + "files can be converted to 2bit using faToTwoBit " + "available here: " + "http://hgdownload.cse.ucsc.edu/admin/exe/", + metavar="two bit file", + required=True, + ) + + required.add_argument( + "--GCbiasFrequenciesFile", + "-freq", + help="Indicate the output file from " + "computeGCBias containing " + "the observed and expected read frequencies per GC-" + "content.", + type=argparse.FileType("r"), + metavar="FILE", + required=True, + ) + + output = parser.add_argument_group("Output options") + output.add_argument( + "--correctedFile", + "-o", + help="Name of the corrected file. The ending will " + "be used to decide the output file format. 
The options "
+        'are ".bam", ".bw" for a bigWig file, ".bg" for a '
+        "bedGraph file.",
+        metavar="FILE",
+        type=argparse.FileType("w"),
+        required=True,
+    )

     # define the optional arguments
-    optional = parser.add_argument_group('Optional arguments')
-    optional.add_argument("--help", "-h", action="help",
-                          help="show this help message and exit")
+    optional = parser.add_argument_group("Optional arguments")
+    optional.add_argument(
+        "--help", "-h", action="help", help="show this help message and exit"
+    )

     return parser

@@ -181,8 +197,8 @@ def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
     i = 0
-    tbit = py2bit.open(global_vars['2bit'])
-    bam = openBam(global_vars['bam'])
+    tbit = py2bit.open(global_vars["2bit"])
+    bam = openBam(global_vars["bam"])
     read_repetitions = 0
     removed_duplicated_reads = 0
     startTime = time.time()
@@ -191,8 +207,7 @@ def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
     # r.flag & 4 == 0 is to skip unmapped
     # reads that nevertheless are assigned
     # to a genomic position
-    reads = [r for r in bam.fetch(chrNameBam, start, end)
-             if r.flag & 4 == 0]
+    reads = [r for r in bam.fetch(chrNameBam, start, end) if r.flag & 4 == 0]

     bam.close()

@@ -203,8 +218,7 @@ def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
         r_index += 1
         try:
             # calculate GC content of read fragment
-            gc = getReadGCcontent(tbit, read, fragmentLength,
-                                  chrNameBit)
+            gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit)
         except Exception as detail:
             print(detail)
             """ this exception happens when the end of a
@@ -214,18 +228,23 @@ def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
             continue

         # is this read in the same orientation and position as the previous?
-        if r_index > 0 and read.pos == reads[r_index - 1].pos and \
-                read.is_reverse == reads[r_index - 1].is_reverse \
-                and read.pnext == reads[r_index - 1].pnext:
+        if (
+            r_index > 0
+            and read.pos == reads[r_index - 1].pos
+            and read.is_reverse == reads[r_index - 1].is_reverse
+            and read.pnext == reads[r_index - 1].pnext
+        ):
             read_repetitions += 1
-            if read_repetitions >= global_vars['max_dup_gc'][gc]:
+            if read_repetitions >= global_vars["max_dup_gc"][gc]:
                 removed_duplicated_reads += 1
                 continue
         else:
             read_repetitions = 0

         try:
-            fragmentStart, fragmentEnd = getFragmentFromRead(read, fragmentLength, extendPairedEnds=True)
+            fragmentStart, fragmentEnd = getFragmentFromRead(
+                read, fragmentLength, extendPairedEnds=True
+            )
             vectorStart = max(fragmentStart - start, 0)
             vectorEnd = min(fragmentEnd - start, end - start)
         except TypeError:
@@ -239,25 +258,33 @@ def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
     try:
         if debug:
             endTime = time.time()
-            print("{}, processing {} ({:.1f} per sec) "
-                  "reads @ {}:{}-{}".format(multiprocessing.current_process().name,
-                                            i, i / (endTime - startTime),
-                                            chrNameBit, start, end))
+            print(
+                "{}, processing {} ({:.1f} per sec) "
+                "reads @ {}:{}-{}".format(
+                    multiprocessing.current_process().name,
+                    i,
+                    i / (endTime - startTime),
+                    chrNameBit,
+                    start,
+                    end,
+                )
+            )
     except NameError:
         pass

     if i == 0:
         return None

-    _file = open(utilities.getTempFileName(suffix='.bg'), 'w')
+    _file = open(utilities.getTempFileName(suffix=".bg"), "w")
     # save in bedgraph format
     for bin in range(0, len(cvg_corr), step):
-        value = np.mean(cvg_corr[bin:min(bin + step, end)])
+        value = np.mean(cvg_corr[bin : min(bin + step, end)])
         if value > 0:
             writeStart = start + bin
             writeEnd = min(start + bin + step, end)
-            _file.write("%s\t%d\t%d\t%.1f\n" % (chrNameBit, writeStart,
-                                                writeEnd, value))
+            _file.write(
+                "%s\t%d\t%d\t%.1f\n" % (chrNameBit, writeStart, writeEnd, value)
+            )

     tempFileName = _file.name
     _file.close()

@@ -288,10 +315,15 @@ def writeCorrectedSam_wrapper(args):
     return writeCorrectedSam_worker(*args)

-def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
-                             step=None,
-                             tag_but_not_change_number=False,
-                             verbose=True):
+def writeCorrectedSam_worker(
+    chrNameBam,
+    chrNameBit,
+    start,
+    end,
+    step=None,
+    tag_but_not_change_number=False,
+    verbose=True,
+):
     r"""
     Writes a BAM file, deleting and adding some reads in order to compensate
     for the GC bias. **This is a stochastic method.**
@@ -333,12 +365,12 @@ def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
         print("Sam for %s %s %s " % (chrNameBit, start, end))
     i = 0

-    tbit = py2bit.open(global_vars['2bit'])
+    tbit = py2bit.open(global_vars["2bit"])

-    bam = openBam(global_vars['bam'])
-    tempFileName = utilities.getTempFileName(suffix='.bam')
+    bam = openBam(global_vars["bam"])
+    tempFileName = utilities.getTempFileName(suffix=".bam")

-    outfile = pysam.Samfile(tempFileName, 'wb', template=bam)
+    outfile = pysam.Samfile(tempFileName, "wb", template=bam)
     startTime = time.time()
     matePairs = {}
     read_repetitions = 0

@@ -347,8 +379,11 @@ def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
     # cache data
     # r.flag & 4 == 0 is to filter unmapped reads that
     # have a genomic position
-    reads = [r for r in bam.fetch(chrNameBam, start, end)
-             if r.pos > start and r.flag & 4 == 0]
+    reads = [
+        r
+        for r in bam.fetch(chrNameBam, start, end)
+        if r.pos > start and r.flag & 4 == 0
+    ]

     r_index = -1
     for read in reads:
@@ -361,26 +396,29 @@ def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end,
         # check if a mate has already been processed
         # to apply the same correction
         try:
-            copies = matePairs[read.qname]['copies']
-            gc = matePairs[read.qname]['gc']
-            del(matePairs[read.qname])
+            copies = matePairs[read.qname]["copies"]
+            gc = matePairs[read.qname]["gc"]
+            del matePairs[read.qname]
         except:
             # this exception happens when a mate is
             # not present. This could
             # happen because of removal of the mate
             # by some filtering
-            gc = getReadGCcontent(tbit, read, fragmentLength,
-                                  chrNameBit)
+            gc = getReadGCcontent(tbit, read, fragmentLength, chrNameBit)
             if gc:
                 copies = numCopiesOfRead(float(1) / R_gc[gc])
             else:
                 copies = 1
         # is this read in the same orientation and position as the previous?
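# Note on the step below: reads that pile up at the same position, strand, and
# mate position beyond the GC-dependent cap in global_vars["max_dup_gc"] are
# treated as PCR duplicates and get copies = 0, i.e. they are dropped from the
# corrected output. The caps are precomputed in main() further down with
# binom.isf(1e-7, F_gc[gc], 1.0 / N_gc[gc]).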
- if gc and r_index > 0 and read.pos == reads[r_index - 1].pos \ - and read.is_reverse == reads[r_index - 1].is_reverse \ - and read.pnext == reads[r_index - 1].pnext: + if ( + gc + and r_index > 0 + and read.pos == reads[r_index - 1].pos + and read.is_reverse == reads[r_index - 1].is_reverse + and read.pnext == reads[r_index - 1].pnext + ): read_repetitions += 1 - if read_repetitions >= global_vars['max_dup_gc'][gc]: + if read_repetitions >= global_vars["max_dup_gc"][gc]: copies = 0 # in other words do not take into account this read removed_duplicated_reads += 1 else: @@ -405,23 +443,23 @@ def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end, replace_tags = True if gc: - GC = int(100 * np.round(float(gc) / fragmentLength, - decimals=2)) - readTag.append( - ('YC', float(round(float(1) / R_gc[gc], 2)), "f")) - readTag.append(('YN', copies, "i")) + GC = int(100 * np.round(float(gc) / fragmentLength, decimals=2)) + readTag.append(("YC", float(round(float(1) / R_gc[gc], 2)), "f")) + readTag.append(("YN", copies, "i")) else: GC = -1 - readTag.append(('YG', GC, "i")) + readTag.append(("YG", GC, "i")) if replace_tags: read.set_tags(readTag) - if read.is_paired and read.is_proper_pair \ - and not read.mate_is_unmapped \ - and not read.is_reverse: - matePairs[readName] = {'copies': copies, - 'gc': gc} + if ( + read.is_paired + and read.is_proper_pair + and not read.mate_is_unmapped + and not read.is_reverse + ): + matePairs[readName] = {"copies": copies, "gc": gc} """ outfile.write(read) @@ -440,23 +478,40 @@ def writeCorrectedSam_worker(chrNameBam, chrNameBit, start, end, if verbose: if i % 500000 == 0 and i > 0: endTime = time.time() - print("{}, processing {} ({:.1f} per sec) reads " - "@ {}:{}-{}".format(multiprocessing.current_process().name, - i, i / (endTime - startTime), - chrNameBit, start, end)) + print( + "{}, processing {} ({:.1f} per sec) reads " + "@ {}:{}-{}".format( + multiprocessing.current_process().name, + i, + i / (endTime - startTime), + chrNameBit, + start, + end, + ) + ) i += 1 outfile.close() if verbose: endTime = time.time() - print("{}, processing {} ({:.1f} per sec) reads " - "@ {}:{}-{}".format(multiprocessing.current_process().name, - i, i / (endTime - startTime), - chrNameBit, start, end)) - percentage = float(removed_duplicated_reads) * 100 / len(reads) \ - if len(reads) > 0 else 0 - print("duplicated reads removed %d of %d (%.2f) " % - (removed_duplicated_reads, len(reads), percentage)) + print( + "{}, processing {} ({:.1f} per sec) reads " + "@ {}:{}-{}".format( + multiprocessing.current_process().name, + i, + i / (endTime - startTime), + chrNameBit, + start, + end, + ) + ) + percentage = ( + float(removed_duplicated_reads) * 100 / len(reads) if len(reads) > 0 else 0 + ) + print( + "duplicated reads removed %d of %d (%.2f) " + % (removed_duplicated_reads, len(reads), percentage) + ) return tempFileName @@ -542,10 +597,10 @@ def run_shell_command(command): subprocess.check_call(command, shell=True) except subprocess.CalledProcessError as error: - sys.stderr.write('Error{}\n'.format(error)) + sys.stderr.write("Error{}\n".format(error)) exit(1) except Exception as error: - sys.stderr.write('Error: {}\n'.format(error)) + sys.stderr.write("Error: {}\n".format(error)) exit(1) @@ -561,42 +616,47 @@ def main(args=None): global global_vars global_vars = {} - global_vars['2bit'] = args.genome - global_vars['bam'] = args.bamfile + global_vars["2bit"] = args.genome + global_vars["bam"] = args.bamfile # compute the probability to find more than one read (a redundant 
read) # at a certain position based on the gc of the read fragment # the binomial function is used for that - max_dup_gc = [binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x]) - if F_gc[x] > 0 and N_gc[x] > 0 else 1 - for x in range(len(F_gc))] + max_dup_gc = [ + binom.isf(1e-7, F_gc[x], 1.0 / N_gc[x]) if F_gc[x] > 0 and N_gc[x] > 0 else 1 + for x in range(len(F_gc)) + ] - global_vars['max_dup_gc'] = max_dup_gc + global_vars["max_dup_gc"] = max_dup_gc - tbit = py2bit.open(global_vars['2bit']) - bam, mapped, unmapped, stats = openBam(args.bamfile, returnStats=True, nThreads=args.numberOfProcessors) + tbit = py2bit.open(global_vars["2bit"]) + bam, mapped, unmapped, stats = openBam( + args.bamfile, returnStats=True, nThreads=args.numberOfProcessors + ) - global_vars['genome_size'] = sum(tbit.chroms().values()) - global_vars['total_reads'] = mapped - global_vars['reads_per_bp'] = \ - float(global_vars['total_reads']) / args.effectiveGenomeSize + global_vars["genome_size"] = sum(tbit.chroms().values()) + global_vars["total_reads"] = mapped + global_vars["reads_per_bp"] = ( + float(global_vars["total_reads"]) / args.effectiveGenomeSize + ) # apply correction print("applying correction") # divide the genome in fragments containing about 4e5 reads. # This amount of reads takes about 20 seconds # to process per core (48 cores, 256 Gb memory) - chunkSize = int(4e5 / global_vars['reads_per_bp']) + chunkSize = int(4e5 / global_vars["reads_per_bp"]) # chromSizes: list of tuples - chromSizes = [(bam.references[i], bam.lengths[i]) - for i in range(len(bam.references))] + chromSizes = [ + (bam.references[i], bam.lengths[i]) for i in range(len(bam.references)) + ] regionStart = 0 if args.region: - chromSizes, regionStart, regionEnd, chunkSize = \ - mapReduce.getUserRegion(chromSizes, args.region, - max_chunk_size=chunkSize) + chromSizes, regionStart, regionEnd, chunkSize = mapReduce.getUserRegion( + chromSizes, args.region, max_chunk_size=chunkSize + ) print("genome partition size for multiprocessing: {}".format(chunkSize)) print("using region {}".format(args.region)) @@ -617,20 +677,21 @@ def main(args=None): print("Reads in this chromosome will be skipped") continue length = min(size, i + chunkSize) - mp_args.append((chrom, chrNameBamToBit[chrom], i, length, - bedGraphStep)) + mp_args.append((chrom, chrNameBamToBit[chrom], i, length, bedGraphStep)) c += 1 pool = multiprocessing.Pool(args.numberOfProcessors) - if args.correctedFile.name.endswith('bam'): + if args.correctedFile.name.endswith("bam"): if len(mp_args) > 1 and args.numberOfProcessors > 1: - print(("using {} processors for {} " - "number of tasks".format(args.numberOfProcessors, - len(mp_args)))) - - res = pool.map_async( - writeCorrectedSam_wrapper, mp_args).get(9999999) + print( + ( + "using {} processors for {} " + "number of tasks".format(args.numberOfProcessors, len(mp_args)) + ) + ) + + res = pool.map_async(writeCorrectedSam_wrapper, mp_args).get(9999999) else: res = list(map(writeCorrectedSam_wrapper, mp_args)) @@ -655,22 +716,19 @@ def main(args=None): for tempFileName in res: os.remove(tempFileName) - if args.correctedFile.name.endswith('bg') or \ - args.correctedFile.name.endswith('bw'): - + if args.correctedFile.name.endswith("bg") or args.correctedFile.name.endswith("bw"): if len(mp_args) > 1 and args.numberOfProcessors > 1: - res = pool.map_async(writeCorrected_wrapper, mp_args).get(9999999) else: res = list(map(writeCorrected_wrapper, mp_args)) oname = args.correctedFile.name args.correctedFile.close() - if oname.endswith('bg'): - f = 
open(oname, 'wb') + if oname.endswith("bg"): + f = open(oname, "wb") for tempFileName in res: if tempFileName: - shutil.copyfileobj(open(tempFileName, 'rb'), f) + shutil.copyfileobj(open(tempFileName, "rb"), f) os.remove(tempFileName) f.close() else: @@ -678,68 +736,66 @@ def main(args=None): writeBedGraph.bedGraphToBigWig(chromSizes, res, oname) -class Tester(): +class Tester: def __init__(self): import os + self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_corrGC/" self.tbitFile = self.root + "sequence.2bit" self.bamFile = self.root + "test.bam" - self.chrNameBam = '2L' - self.chrNameBit = 'chr2L' + self.chrNameBam = "2L" + self.chrNameBit = "chr2L" bam, mapped, unmapped, stats = openBam(self.bamFile, returnStats=True) tbit = py2bit.open(self.tbitFile) global debug debug = 0 global global_vars - global_vars = {'2bit': self.tbitFile, - 'bam': self.bamFile, - 'filter_out': None, - 'extra_sampling_file': None, - 'max_reads': 5, - 'min_reads': 0, - 'min_reads': 0, - 'reads_per_bp': 0.3, - 'total_reads': mapped, - 'genome_size': sum(tbit.chroms().values())} + global_vars = { + "2bit": self.tbitFile, + "bam": self.bamFile, + "filter_out": None, + "extra_sampling_file": None, + "max_reads": 5, + "min_reads": 0, + "min_reads": 0, + "reads_per_bp": 0.3, + "total_reads": mapped, + "genome_size": sum(tbit.chroms().values()), + } def testWriteCorrectedChunk(self): - """ prepare arguments for test - """ + """prepare arguments for test""" global R_gc, R_gc_min, R_gc_max R_gc = np.loadtxt(self.root + "R_gc_paired.txt") - global_vars['max_dup_gc'] = np.ones(301) + global_vars["max_dup_gc"] = np.ones(301) start = 200 end = 300 bedGraphStep = 25 - return (self.chrNameBam, - self.chrNameBit, start, end, bedGraphStep) + return (self.chrNameBam, self.chrNameBit, start, end, bedGraphStep) def testWriteCorrectedSam(self): - """ prepare arguments for test - """ + """prepare arguments for test""" global R_gc, R_gc_min, R_gc_max R_gc = np.loadtxt(self.root + "R_gc_paired.txt") - global_vars['max_dup_gc'] = np.ones(301) + global_vars["max_dup_gc"] = np.ones(301) start = 200 end = 250 - return (self.chrNameBam, - self.chrNameBit, start, end) + return (self.chrNameBam, self.chrNameBit, start, end) def testWriteCorrectedSam_paired(self): - """ prepare arguments for test. 
- """ + """prepare arguments for test.""" global R_gc, R_gc_min, R_gc_max R_gc = np.loadtxt(self.root + "R_gc_paired.txt") start = 0 end = 500 global global_vars - global_vars['bam'] = self.root + "paired.bam" - return 'chr2L', 'chr2L', start, end + global_vars["bam"] = self.root + "paired.bam" + return "chr2L", "chr2L", start, end if __name__ == "__main__": diff --git a/deeptools/correlation.py b/deeptools/correlation.py index 18fdbfcfd..9db601b30 100755 --- a/deeptools/correlation.py +++ b/deeptools/correlation.py @@ -5,9 +5,10 @@ import scipy.cluster.hierarchy as sch import scipy.stats import matplotlib as mpl -mpl.use('Agg') -mpl.rcParams['pdf.fonttype'] = 42 -mpl.rcParams['svg.fonttype'] = 'none' + +mpl.use("Agg") +mpl.rcParams["pdf.fonttype"] = 42 +mpl.rcParams["svg.fonttype"] = "none" from deeptools import cm # noqa: F401 import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec @@ -22,7 +23,7 @@ import plotly.figure_factory as ff -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") class Correlation: @@ -33,13 +34,15 @@ class to work with matrices them and make scatter plots """ - def __init__(self, matrix_file, - corr_method=None, - labels=None, - remove_outliers=False, - skip_zeros=False, - log1p=False): - + def __init__( + self, + matrix_file, + corr_method=None, + labels=None, + remove_outliers=False, + skip_zeros=False, + log1p=False, + ): self.load_matrix(matrix_file) self.skip_zeros = skip_zeros self.corr_method = corr_method @@ -84,19 +87,22 @@ def load_matrix(self, matrix_file): _ma = np.load(matrix_file) # matrix: cols correspond to samples - self.matrix = np.asarray(_ma['matrix'].tolist()) + self.matrix = np.asarray(_ma["matrix"].tolist()) if np.any(np.isnan(self.matrix)): num_nam = len(np.flatnonzero(np.isnan(self.matrix.flatten()))) - sys.stderr.write("*Warning*. {} NaN values were found. They will be removed along with the " - "corresponding bins in other samples for the computation " - "and plotting\n".format(num_nam)) + sys.stderr.write( + "*Warning*. {} NaN values were found. 
They will be removed along with the " + "corresponding bins in other samples for the computation " + "and plotting\n".format(num_nam) + ) self.matrix = np.ma.compress_rows(np.ma.masked_invalid(self.matrix)) - self.labels = list(map(toString, _ma['labels'])) + self.labels = list(map(toString, _ma["labels"])) - assert len(self.labels) == self.matrix.shape[1], "ERROR, length of labels is not equal " \ - "to length of matrix samples" + assert len(self.labels) == self.matrix.shape[1], ( + "ERROR, length of labels is not equal " "to length of matrix samples" + ) @staticmethod def get_outlier_indices(data, max_deviation=200): @@ -148,15 +154,15 @@ def remove_outliers(self, verbose=True): # that's why the intersection is used to_remove = to_remove.intersection(outliers) if len(to_remove): - to_keep = [x for x in range(self.matrix.shape[0]) - if x not in to_remove] + to_keep = [x for x in range(self.matrix.shape[0]) if x not in to_remove] self.matrix = self.matrix[to_keep, :] if verbose: sys.stderr.write( "total/filtered/left: " - "{}/{}/{}\n".format(unfiltered, - unfiltered - len(to_keep), - len(to_keep))) + "{}/{}/{}\n".format( + unfiltered, unfiltered - len(to_keep), len(to_keep) + ) + ) return self.matrix @@ -177,11 +183,10 @@ def save_corr_matrix(self, file_handle): self.labels = [toString(x) for x in self.labels] file_handle.write("\t'" + "'\t'".join(self.labels) + "'\n") - fmt = "\t".join(np.repeat('%.4f', self.corr_matrix.shape[1])) + "\n" + fmt = "\t".join(np.repeat("%.4f", self.corr_matrix.shape[1])) + "\n" i = 0 for row in self.corr_matrix: - file_handle.write( - "'%s'\t" % self.labels[i] + fmt % tuple(row)) + file_handle.write("'%s'\t" % self.labels[i] + fmt % tuple(row)) i += 1 def compute_correlation(self): @@ -218,11 +223,11 @@ def compute_correlation(self): num_samples = len(self.labels) # initialize correlation matrix - if self.corr_method == 'pearson': + if self.corr_method == "pearson": self.corr_matrix = np.ma.corrcoef(self.matrix.T, allow_masked=True) else: - corr_matrix = np.zeros((num_samples, num_samples), dtype='float') + corr_matrix = np.zeros((num_samples, num_samples), dtype="float") # do an all vs all correlation using the # indices of the upper triangle rows, cols = np.triu_indices(num_samples) @@ -230,15 +235,25 @@ def compute_correlation(self): for index in range(len(rows)): row = rows[index] col = cols[index] - corr_matrix[row, col] = scipy.stats.spearmanr(self.matrix[:, row], self.matrix[:, col])[0] + corr_matrix[row, col] = scipy.stats.spearmanr( + self.matrix[:, row], self.matrix[:, col] + )[0] # make the matrix symmetric self.corr_matrix = corr_matrix + np.triu(corr_matrix, 1).T return self.corr_matrix - def plotly_correlation(self, corr_matrix, plot_filename, labels, plot_title='', - vmax=None, vmin=None, plot_numbers=True, - colormap='jet'): + def plotly_correlation( + self, + corr_matrix, + plot_filename, + labels, + plot_title="", + vmax=None, + vmin=None, + plot_numbers=True, + colormap="jet", + ): """plot_correlation, but using plotly""" textElement = [] for row in range(corr_matrix.shape[0]): @@ -247,7 +262,7 @@ def plotly_correlation(self, corr_matrix, plot_filename, labels, plot_title='', if plot_numbers: trow.append("{:0.2f}".format(corr_matrix[row, col])) else: - trow.append('') + trow.append("") textElement.append(trow) zauto = True @@ -255,13 +270,32 @@ def plotly_correlation(self, corr_matrix, plot_filename, labels, plot_title='', zauto = False convertedCmap = convertCmap(colormap) - fig = ff.create_annotated_heatmap(corr_matrix, x=labels, 
y=labels, colorscale=convertedCmap, showscale=True, zauto=zauto, zmin=vmin, zmax=vmax, annotation_text=textElement) - fig.layout['title'] = plot_title + fig = ff.create_annotated_heatmap( + corr_matrix, + x=labels, + y=labels, + colorscale=convertedCmap, + showscale=True, + zauto=zauto, + zmin=vmin, + zmax=vmax, + annotation_text=textElement, + ) + fig.layout["title"] = plot_title offline.plot(fig, filename=plot_filename, auto_open=False) - def plot_correlation(self, plot_filename, plot_title='', vmax=None, - vmin=None, colormap='jet', image_format=None, - plot_numbers=False, plotWidth=11, plotHeight=9.5): + def plot_correlation( + self, + plot_filename, + plot_title="", + vmax=None, + vmin=None, + colormap="jet", + image_format=None, + plot_numbers=False, + plotWidth=11, + plotHeight=9.5, + ): """ plots a correlation using a symmetric heatmap """ @@ -274,12 +308,12 @@ def plot_correlation(self, plot_filename, plot_title='', vmax=None, font_size = 5 else: font_size = int(14 - 0.25 * num_rows) - mpl.rcParams.update({'font.size': font_size}) + mpl.rcParams.update({"font.size": font_size}) # set the minimum and maximum values if vmax is None: vmax = 1 if vmin is None: - vmin = 0 if corr_matrix .min() >= 0 else -1 + vmin = 0 if corr_matrix.min() >= 0 else -1 # Compute and plot dendrogram. fig = plt.figure(figsize=(plotWidth, plotHeight)) @@ -287,9 +321,10 @@ def plot_correlation(self, plot_filename, plot_title='', vmax=None, axdendro = fig.add_axes([0.02, 0.12, 0.1, 0.66]) axdendro.set_axis_off() - y_var = sch.linkage(corr_matrix, method='complete') - z_var = sch.dendrogram(y_var, orientation='left', - link_color_func=lambda k: 'darkred') + y_var = sch.linkage(corr_matrix, method="complete") + z_var = sch.dendrogram( + y_var, orientation="left", link_color_func=lambda k: "darkred" + ) axdendro.set_xticks([]) axdendro.set_yticks([]) cmap = copy.copy(plt.get_cmap(colormap)) @@ -301,107 +336,135 @@ def plot_correlation(self, plot_filename, plot_title='', vmax=None, # a good contrast between the correlation numbers that are # plotted on black. if plot_numbers: - cmap = pltcolors.LinearSegmentedColormap.from_list(colormap + "clipped", - cmap(np.linspace(0, 0.9, 10))) + cmap = pltcolors.LinearSegmentedColormap.from_list( + colormap + "clipped", cmap(np.linspace(0, 0.9, 10)) + ) - cmap.set_under((0., 0., 1.)) + cmap.set_under((0.0, 0.0, 1.0)) # Plot distance matrix. 
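# z_var["leaves"] below is the row order chosen by the dendrogram; reindexing
# corr_matrix with it places highly correlated samples next to each other, so
# the heatmap blocks line up with the tree drawn on the left.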
axmatrix = fig.add_axes([0.13, 0.1, 0.6, 0.7]) - index = z_var['leaves'] + index = z_var["leaves"] corr_matrix = corr_matrix[index, :] corr_matrix = corr_matrix[:, index] if corr_matrix.shape[0] > 30: # when there are too many rows it is better to remove # the black lines surrounding the boxes in the heatmap - edge_color = 'none' + edge_color = "none" else: - edge_color = 'black' + edge_color = "black" if image_format == "plotly": - self.plotly_correlation(corr_matrix, - plot_filename, - self.labels, - plot_title=plot_title, - vmax=vmax, - vmin=vmin, - colormap=colormap, - plot_numbers=plot_numbers) + self.plotly_correlation( + corr_matrix, + plot_filename, + self.labels, + plot_title=plot_title, + vmax=vmax, + vmin=vmin, + colormap=colormap, + plot_numbers=plot_numbers, + ) return - img_mat = axmatrix.pcolormesh(corr_matrix, - edgecolors=edge_color, - cmap=cmap, - vmax=vmax, - vmin=vmin) + img_mat = axmatrix.pcolormesh( + corr_matrix, edgecolors=edge_color, cmap=cmap, vmax=vmax, vmin=vmin + ) axmatrix.set_xlim(0, num_rows) axmatrix.set_ylim(0, num_rows) axmatrix.yaxis.tick_right() - axmatrix.set_yticks(np.arange(corr_matrix .shape[0]) + 0.5) - axmatrix.set_yticklabels(np.array(self.labels).astype('str')[index]) + axmatrix.set_yticks(np.arange(corr_matrix.shape[0]) + 0.5) + axmatrix.set_yticklabels(np.array(self.labels).astype("str")[index]) axmatrix.xaxis.set_tick_params(labeltop=True) axmatrix.xaxis.set_tick_params(labelbottom=False) - axmatrix.set_xticks(np.arange(corr_matrix .shape[0]) + 0.5) - axmatrix.set_xticklabels(np.array(self.labels).astype('str')[index], rotation=45, ha='left') + axmatrix.set_xticks(np.arange(corr_matrix.shape[0]) + 0.5) + axmatrix.set_xticklabels( + np.array(self.labels).astype("str")[index], rotation=45, ha="left" + ) - axmatrix.tick_params( - axis='x', - which='both', - bottom=False, - top=False) + axmatrix.tick_params(axis="x", which="both", bottom=False, top=False) - axmatrix.tick_params( - axis='y', - which='both', - left=False, - right=False) + axmatrix.tick_params(axis="y", which="both", left=False, right=False) # Plot colorbar axcolor = fig.add_axes([0.13, 0.065, 0.6, 0.02]) - cobar = plt.colorbar(img_mat, cax=axcolor, orientation='horizontal') + cobar = plt.colorbar(img_mat, cax=axcolor, orientation="horizontal") cobar.solids.set_edgecolor("face") if plot_numbers: for row in range(num_rows): for col in range(num_rows): - axmatrix.text(row + 0.5, col + 0.5, - "{:.2f}".format(corr_matrix[row, col]), - ha='center', va='center') + axmatrix.text( + row + 0.5, + col + 0.5, + "{:.2f}".format(corr_matrix[row, col]), + ha="center", + va="center", + ) self.column_order = index fig.savefig(plot_filename, format=image_format) plt.close() - def plotly_scatter(self, plot_filename, corr_matrix, plot_title='', minXVal=None, maxXVal=None, minYVal=None, maxYVal=None): + def plotly_scatter( + self, + plot_filename, + corr_matrix, + plot_title="", + minXVal=None, + maxXVal=None, + minYVal=None, + maxYVal=None, + ): """Make the scatter plot of a matrix with plotly""" n = self.matrix.shape[1] self.matrix = self.matrix fig = go.Figure() - domainWidth = 1. 
/ n + domainWidth = 1.0 / n annos = [] for i in range(n): x = domainWidth * (i + 1) y = 1 - (domainWidth * i + 0.5 * domainWidth) - anno = dict(text=self.labels[i], showarrow=False, xref='paper', yref='paper', x=x, y=y, xanchor='right', yanchor='middle') + anno = dict( + text=self.labels[i], + showarrow=False, + xref="paper", + yref="paper", + x=x, + y=y, + xanchor="right", + yanchor="middle", + ) annos.append(anno) data = [] zMin = np.inf zMax = -np.inf for x in range(n): - xanchor = 'x{}'.format(x + 1) + xanchor = "x{}".format(x + 1) base = x * domainWidth domain = [base, base + domainWidth] if x > 0: base = 1 - base - fig['layout']['xaxis{}'.format(x + 1)] = dict(domain=domain, range=[minXVal, maxXVal], anchor='free', position=base) + fig["layout"]["xaxis{}".format(x + 1)] = dict( + domain=domain, + range=[minXVal, maxXVal], + anchor="free", + position=base, + ) for y in range(0, n): - yanchor = 'y{}'.format(y + 1) + yanchor = "y{}".format(y + 1) if x == 1: base = 1 - y * domainWidth domain = [base - domainWidth, base] - fig['layout']['yaxis{}'.format(y + 1)] = dict(domain=domain, range=[minYVal, maxYVal], side='right', anchor='free', position=1.0) + fig["layout"]["yaxis{}".format(y + 1)] = dict( + domain=domain, + range=[minYVal, maxYVal], + side="right", + anchor="free", + position=1.0, + ) if x > y: vector1 = self.matrix[:, x] @@ -412,22 +475,39 @@ def plotly_scatter(self, plot_filename, corr_matrix, plot_title='', minXVal=None zMin = np.min(Z) if np.max(Z) > zMax: zMax = np.max(Z) - name = '{}={:.2f}'.format(self.corr_method, corr_matrix[x, y]) - trace = go.Heatmap(z=Z, x=xEdges, y=yEdges, showlegend=False, xaxis=xanchor, yaxis=yanchor, name=name, showscale=False) + name = "{}={:.2f}".format(self.corr_method, corr_matrix[x, y]) + trace = go.Heatmap( + z=Z, + x=xEdges, + y=yEdges, + showlegend=False, + xaxis=xanchor, + yaxis=yanchor, + name=name, + showscale=False, + ) data.append(trace) # Fix the colorbar bounds for trace in data: trace.update(zmin=zMin, zmax=zMax) - data[-1]['colorbar'].update(title="log10(instances per bin)", titleside="right") + data[-1]["colorbar"].update(title="log10(instances per bin)", titleside="right") data[-1].update(showscale=True) fig.add_traces(data) - fig['layout'].update(title=plot_title, showlegend=False, annotations=annos) + fig["layout"].update(title=plot_title, showlegend=False, annotations=annos) offline.plot(fig, filename=plot_filename, auto_open=False) - def plot_scatter(self, plot_filename, plot_title='', image_format=None, log1p=False, xRange=None, yRange=None): + def plot_scatter( + self, + plot_filename, + plot_title="", + image_format=None, + log1p=False, + xRange=None, + yRange=None, + ): """ Plot the scatter plots of a matrix in which each row is a sample @@ -438,7 +518,7 @@ def plot_scatter(self, plot_filename, plot_title='', image_format=None, log1p=Fa grids = gridspec.GridSpec(num_samples, num_samples) grids.update(wspace=0, hspace=0) fig = plt.figure(figsize=(2 * num_samples, 2 * num_samples)) - plt.rcParams['font.size'] = 8.0 + plt.rcParams["font.size"] = 8.0 plt.suptitle(plot_title) if log1p is True: self.matrix = np.log1p(self.matrix) @@ -452,18 +532,28 @@ def plot_scatter(self, plot_filename, plot_title='', image_format=None, log1p=Fa if yRange is not None: min_yvalue = yRange[0] max_yvalue = yRange[1] - if (min_xvalue % 2 == 0 and max_xvalue % 2 == 0) or \ - (min_xvalue % 1 == 0 and max_xvalue % 2 == 1): + if (min_xvalue % 2 == 0 and max_xvalue % 2 == 0) or ( + min_xvalue % 1 == 0 and max_xvalue % 2 == 1 + ): # make one value 
odd and the other even max_xvalue += 1 - if (min_yvalue % 2 == 0 and max_yvalue % 2 == 0) or \ - (min_yvalue % 1 == 0 and max_yvalue % 2 == 1): + if (min_yvalue % 2 == 0 and max_yvalue % 2 == 0) or ( + min_yvalue % 1 == 0 and max_yvalue % 2 == 1 + ): # make one value odd and the other even max_yvalue += 1 # plotly output - if image_format == 'plotly': - self.plotly_scatter(plot_filename, corr_matrix, plot_title=plot_title, minXVal=min_xvalue, maxXVal=max_xvalue, minYVal=min_yvalue, maxYVal=max_yvalue) + if image_format == "plotly": + self.plotly_scatter( + plot_filename, + corr_matrix, + plot_title=plot_title, + minXVal=min_xvalue, + maxXVal=max_xvalue, + minYVal=min_yvalue, + maxYVal=max_yvalue, + ) return rows, cols = np.triu_indices(num_samples) @@ -475,11 +565,16 @@ def plot_scatter(self, plot_filename, plot_title='', image_format=None, log1p=Fa # add titles as # empty plot in the diagonal ax = fig.add_subplot(grids[row, col]) - ax.text(0.5, 0.5, self.labels[row], - verticalalignment='center', - horizontalalignment='center', - fontsize=10, fontweight='bold', - transform=ax.transAxes) + ax.text( + 0.5, + 0.5, + self.labels[row], + verticalalignment="center", + horizontalalignment="center", + fontsize=10, + fontweight="bold", + transform=ax.transAxes, + ) ax.set_axis_off() continue @@ -488,42 +583,37 @@ def plot_scatter(self, plot_filename, plot_title='', image_format=None, log1p=Fa vector1 = self.matrix[:, row] vector2 = self.matrix[:, col] - ax.text(0.2, 0.8, "{}={:.2f}".format(self.corr_method, - corr_matrix[row, col]), - horizontalalignment='left', - transform=ax.transAxes) + ax.text( + 0.2, + 0.8, + "{}={:.2f}".format(self.corr_method, corr_matrix[row, col]), + horizontalalignment="left", + transform=ax.transAxes, + ) ax.get_yaxis().set_tick_params( - which='both', - left=False, - right=False, - direction='out') + which="both", left=False, right=False, direction="out" + ) ax.get_xaxis().set_tick_params( - which='both', - top=False, - bottom=False, - direction='out') + which="both", top=False, bottom=False, direction="out" + ) for tick in ax.xaxis.get_major_ticks(): - tick.label.set_rotation('45') + tick.label.set_rotation("45") if col != num_samples - 1: ax.set_yticklabels([]) else: ax.yaxis.tick_right() ax.get_yaxis().set_tick_params( - which='both', - left=False, - right=True, - direction='out') + which="both", left=False, right=True, direction="out" + ) if col - row == 1: ax.xaxis.tick_bottom() ax.get_xaxis().set_tick_params( - which='both', - top=False, - bottom=True, - direction='out') + which="both", top=False, bottom=True, direction="out" + ) for tick in ax.xaxis.get_major_ticks(): - tick.label.set_rotation('45') + tick.label.set_rotation("45") else: ax.set_xticklabels([]) @@ -540,12 +630,40 @@ def plotly_pca(self, plotFile, Wt, pvar, PCs, eigenvalues, cols, plotTitle): A plotly version of plot_pca, that's called by it to do the actual plotting """ fig = go.Figure() - fig['layout']['xaxis1'] = {'domain': [0.0, 0.48], 'anchor': 'x1', 'title': 'PC{} ({:4.1f}% of var. explained)'.format(PCs[0], 100.0 * pvar[PCs[0] - 1])} - fig['layout']['yaxis1'] = {'domain': [0.0, 1.0], 'anchor': 'x1', 'title': 'PC{} ({:4.1f}% of var. 
explained)'.format(PCs[1], 100.0 * pvar[PCs[1] - 1])} - fig['layout']['xaxis2'] = {'domain': [0.52, 1.0], 'title': 'Principal Component'} - fig['layout']['yaxis2'] = {'domain': [0.0, 1.0], 'anchor': 'x2', 'title': 'Eigenvalue', 'rangemode': 'tozero', 'showgrid': False} - fig['layout']['yaxis3'] = {'domain': [0.0, 1.0], 'anchor': 'x2', 'title': 'Cumulative variability', 'rangemode': 'tozero', 'side': 'right', 'overlaying': 'y2'} - fig['layout'].update(title=plotTitle) + fig["layout"]["xaxis1"] = { + "domain": [0.0, 0.48], + "anchor": "x1", + "title": "PC{} ({:4.1f}% of var. explained)".format( + PCs[0], 100.0 * pvar[PCs[0] - 1] + ), + } + fig["layout"]["yaxis1"] = { + "domain": [0.0, 1.0], + "anchor": "x1", + "title": "PC{} ({:4.1f}% of var. explained)".format( + PCs[1], 100.0 * pvar[PCs[1] - 1] + ), + } + fig["layout"]["xaxis2"] = { + "domain": [0.52, 1.0], + "title": "Principal Component", + } + fig["layout"]["yaxis2"] = { + "domain": [0.0, 1.0], + "anchor": "x2", + "title": "Eigenvalue", + "rangemode": "tozero", + "showgrid": False, + } + fig["layout"]["yaxis3"] = { + "domain": [0.0, 1.0], + "anchor": "x2", + "title": "Cumulative variability", + "rangemode": "tozero", + "side": "right", + "overlaying": "y2", + } + fig["layout"].update(title=plotTitle) # PCA if cols is not None: @@ -553,47 +671,88 @@ def plotly_pca(self, plotFile, Wt, pvar, PCs, eigenvalues, cols, plotTitle): n = len(self.labels) data = [] for i in range(n): - trace = go.Scatter(x=[Wt[PCs[0] - 1, i]], - y=[Wt[PCs[1] - 1, i]], - mode='marker', - xaxis='x1', - yaxis='y1', - name=self.labels[i]) - trace['marker'].update(size=20) + trace = go.Scatter( + x=[Wt[PCs[0] - 1, i]], + y=[Wt[PCs[1] - 1, i]], + mode="marker", + xaxis="x1", + yaxis="y1", + name=self.labels[i], + ) + trace["marker"].update(size=20) if cols is not None: - trace['marker'].update(color=next(colors)) + trace["marker"].update(color=next(colors)) data.append(trace) # Scree plot - trace = go.Bar(showlegend=False, - name='Eigenvalues', - x=range(1, n + 1), - y=eigenvalues[:n], - xaxis='x2', - yaxis='y2') + trace = go.Bar( + showlegend=False, + name="Eigenvalues", + x=range(1, n + 1), + y=eigenvalues[:n], + xaxis="x2", + yaxis="y2", + ) data.append(trace) # Cumulative variability - trace = go.Scatter(showlegend=False, - x=range(1, n + 1), - y=pvar.cumsum()[:n], - mode='lines+markers', - name='Cumulative variability', - xaxis='x2', - yaxis='y3', - line={'color': 'red'}, - marker={'symbol': 'circle-open-dot', 'color': 'black'}) + trace = go.Scatter( + showlegend=False, + x=range(1, n + 1), + y=pvar.cumsum()[:n], + mode="lines+markers", + name="Cumulative variability", + xaxis="x2", + yaxis="y3", + line={"color": "red"}, + marker={"symbol": "circle-open-dot", "color": "black"}, + ) data.append(trace) annos = [] - annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': 'PCA', 'y': 1.0, 'x': 0.25, 'font': {'size': 16}, 'showarrow': False}) - annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': 'Scree plot', 'y': 1.0, 'x': 0.75, 'font': {'size': 16}, 'showarrow': False}) + annos.append( + { + "yanchor": "bottom", + "xref": "paper", + "xanchor": "center", + "yref": "paper", + "text": "PCA", + "y": 1.0, + "x": 0.25, + "font": {"size": 16}, + "showarrow": False, + } + ) + annos.append( + { + "yanchor": "bottom", + "xref": "paper", + "xanchor": "center", + "yref": "paper", + "text": "Scree plot", + "y": 1.0, + "x": 0.75, + "font": {"size": 16}, + "showarrow": False, + } + ) 
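# The two paper-anchored annotations act as panel titles ("PCA" over the left
# half, "Scree plot" over the right); they are attached to the layout after
# the traces collected above are added to the figure.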
fig.add_traces(data) - fig['layout']['annotations'] = annos + fig["layout"]["annotations"] = annos offline.plot(fig, filename=plotFile, auto_open=False) - def plot_pca(self, plot_filename=None, PCs=[1, 2], plot_title='', image_format=None, log1p=False, plotWidth=5, plotHeight=10, cols=None, marks=None): + def plot_pca( + self, + plot_filename=None, + PCs=[1, 2], + plot_title="", + image_format=None, + log1p=False, + plotWidth=5, + plotHeight=10, + cols=None, + marks=None, + ): """ Plot the PCA of a matrix @@ -608,8 +767,8 @@ def plot_pca(self, plot_filename=None, PCs=[1, 2], plot_title='', image_format=N m = m[np.nonzero(rvs)[0], :] rvs = rvs[np.nonzero(rvs)[0]] if self.ntop > 0 and m.shape[0] > self.ntop: - m = m[np.argpartition(rvs, -self.ntop)[-self.ntop:], :] - rvs = rvs[np.argpartition(rvs, -self.ntop)[-self.ntop:]] + m = m[np.argpartition(rvs, -self.ntop)[-self.ntop :], :] + rvs = rvs[np.argpartition(rvs, -self.ntop)[-self.ntop :]] # log2 (if requested) if self.log2: @@ -623,11 +782,13 @@ def plot_pca(self, plot_filename=None, PCs=[1, 2], plot_title='', image_format=N m = m.T # Center and scale - m2 = (m - np.mean(m, axis=0)) + m2 = m - np.mean(m, axis=0) m2 /= np.std(m2, axis=0, ddof=1) # Use the unbiased std. dev. # SVD - U, s, Vh = np.linalg.svd(m2, full_matrices=False, compute_uv=True) # Is full_matrices ever needed? + U, s, Vh = np.linalg.svd( + m2, full_matrices=False, compute_uv=True + ) # Is full_matrices ever needed? # % variance, eigenvalues eigenvalues = s**2 @@ -653,8 +814,10 @@ def plot_pca(self, plot_filename=None, PCs=[1, 2], plot_title='', image_format=N if marks is not None: markers = itertools.cycle(marks) - if image_format == 'plotly': - self.plotly_pca(plot_filename, Wt, pvar, PCs, eigenvalues, cols, plot_title) + if image_format == "plotly": + self.plotly_pca( + plot_filename, Wt, pvar, PCs, eigenvalues, cols, plot_title + ) else: ax1.axhline(y=0, color="black", linestyle="dotted", zorder=1) ax1.axvline(x=0, color="black", linestyle="dotted", zorder=2) @@ -663,42 +826,69 @@ def plot_pca(self, plot_filename=None, PCs=[1, 2], plot_title='', image_format=N marker = next(markers) if isinstance(color, np.ndarray): color = pltcolors.to_hex(color, keep_alpha=True) - ax1.scatter(Wt[PCs[0] - 1, i], Wt[PCs[1] - 1, i], - marker=marker, color=color, s=150, label=self.labels[i], zorder=i + 3) - if plot_title == '': - ax1.set_title('PCA') + ax1.scatter( + Wt[PCs[0] - 1, i], + Wt[PCs[1] - 1, i], + marker=marker, + color=color, + s=150, + label=self.labels[i], + zorder=i + 3, + ) + if plot_title == "": + ax1.set_title("PCA") else: ax1.set_title(plot_title) - ax1.set_xlabel('PC{} ({:4.1f}% of var. explained)'.format(PCs[0], 100.0 * pvar[PCs[0] - 1])) - ax1.set_ylabel('PC{} ({:4.1f}% of var. explained)'.format(PCs[1], 100.0 * pvar[PCs[1] - 1])) - lgd = ax1.legend(scatterpoints=1, loc='center left', borderaxespad=0.5, - bbox_to_anchor=(1, 0.5), - prop={'size': 12}, markerscale=0.9) + ax1.set_xlabel( + "PC{} ({:4.1f}% of var. explained)".format( + PCs[0], 100.0 * pvar[PCs[0] - 1] + ) + ) + ax1.set_ylabel( + "PC{} ({:4.1f}% of var. 
explained)".format( + PCs[1], 100.0 * pvar[PCs[1] - 1] + ) + ) + lgd = ax1.legend( + scatterpoints=1, + loc="center left", + borderaxespad=0.5, + bbox_to_anchor=(1, 0.5), + prop={"size": 12}, + markerscale=0.9, + ) # Scree plot ind = np.arange(n_bars) # the x locations for the groups - width = 0.35 # the width of the bars + width = 0.35 # the width of the bars if mpl.__version__ >= "2.0.0": ax2.bar(2 * width + ind, eigenvalues[:n_bars], width * 2) else: ax2.bar(width + ind, eigenvalues[:n_bars], width * 2) - ax2.set_ylabel('Eigenvalue') - ax2.set_xlabel('Principal Component') - ax2.set_title('Scree plot') + ax2.set_ylabel("Eigenvalue") + ax2.set_xlabel("Principal Component") + ax2.set_title("Scree plot") ax2.set_xticks(ind + width * 2) ax2.set_xticklabels(ind + 1) ax3 = ax2.twinx() ax3.axhline(y=1, color="black", linestyle="dotted") ax3.plot(width * 2 + ind, pvar.cumsum()[:n], "r-") - ax3.plot(width * 2 + ind, pvar.cumsum()[:n], "wo", markeredgecolor="black") + ax3.plot( + width * 2 + ind, pvar.cumsum()[:n], "wo", markeredgecolor="black" + ) ax3.set_ylim([0, 1.05]) - ax3.set_ylabel('Cumulative variability') + ax3.set_ylabel("Cumulative variability") plt.subplots_adjust(top=3.85) plt.tight_layout() - plt.savefig(plot_filename, format=image_format, bbox_extra_artists=(lgd,), bbox_inches='tight') + plt.savefig( + plot_filename, + format=image_format, + bbox_extra_artists=(lgd,), + bbox_inches="tight", + ) plt.close() return Wt, eigenvalues diff --git a/deeptools/correlation_heatmap.py b/deeptools/correlation_heatmap.py index 58dbdfbc0..be11fd694 100644 --- a/deeptools/correlation_heatmap.py +++ b/deeptools/correlation_heatmap.py @@ -1,5 +1,6 @@ from matplotlib import use as mplt_use -mplt_use('Agg') + +mplt_use("Agg") from deeptools import cm # noqa: F401 import matplotlib.pyplot as plt import numpy as np @@ -8,15 +9,22 @@ import matplotlib.colors as pltcolors import copy -rcParams['pdf.fonttype'] = 42 -rcParams['svg.fonttype'] = 'none' -old_settings = np.seterr(all='ignore') - +rcParams["pdf.fonttype"] = 42 +rcParams["svg.fonttype"] = "none" +old_settings = np.seterr(all="ignore") -def plot_correlation(corr_matrix, labels, plotFileName, vmax=None, - vmin=None, colormap='jet', image_format=None, - plot_numbers=False, plot_title=''): +def plot_correlation( + corr_matrix, + labels, + plotFileName, + vmax=None, + vmin=None, + colormap="jet", + image_format=None, + plot_numbers=False, + plot_title="", +): num_rows = corr_matrix.shape[0] # set a font size according to figure length @@ -26,7 +34,7 @@ def plot_correlation(corr_matrix, labels, plotFileName, vmax=None, font_size = 5 else: font_size = int(14 - 0.25 * num_rows) - rcParams.update({'font.size': font_size}) + rcParams.update({"font.size": font_size}) # set the minimum and maximum values if vmax is None: vmax = 1 @@ -39,9 +47,10 @@ def plot_correlation(corr_matrix, labels, plotFileName, vmax=None, plt.suptitle(plot_title) axdendro = fig.add_axes([0.02, 0.12, 0.1, 0.66]) axdendro.set_axis_off() - y_var = sch.linkage(corr_matrix, method='complete') - z_var = sch.dendrogram(y_var, orientation='right', - link_color_func=lambda k: 'darkred') + y_var = sch.linkage(corr_matrix, method="complete") + z_var = sch.dendrogram( + y_var, orientation="right", link_color_func=lambda k: "darkred" + ) axdendro.set_xticks([]) axdendro.set_yticks([]) cmap = copy.copy(plt.get_cmap(colormap)) @@ -53,58 +62,53 @@ def plot_correlation(corr_matrix, labels, plotFileName, vmax=None, # a good contrast between the correlation numbers that are # plotted on black. 
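# Sampling only the 0-0.9 span of the source colormap below drops its
# brightest colors, so the correlation values printed in black over the cells
# stay legible.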
if plot_numbers: - cmap = pltcolors.LinearSegmentedColormap.from_list(colormap + "clipped", - cmap(np.linspace(0, 0.9, 10))) + cmap = pltcolors.LinearSegmentedColormap.from_list( + colormap + "clipped", cmap(np.linspace(0, 0.9, 10)) + ) - cmap.set_under((0., 0., 1.)) + cmap.set_under((0.0, 0.0, 1.0)) # Plot distance matrix. axmatrix = fig.add_axes([0.13, 0.1, 0.6, 0.7]) - index = z_var['leaves'] + index = z_var["leaves"] corr_matrix = corr_matrix[index, :] corr_matrix = corr_matrix[:, index] - img_mat = axmatrix.pcolormesh(corr_matrix, - edgecolors='black', - cmap=cmap, - vmax=vmax, - vmin=vmin) + img_mat = axmatrix.pcolormesh( + corr_matrix, edgecolors="black", cmap=cmap, vmax=vmax, vmin=vmin + ) axmatrix.set_xlim(0, num_rows) axmatrix.set_ylim(0, num_rows) axmatrix.yaxis.tick_right() axmatrix.set_yticks(np.arange(corr_matrix.shape[0]) + 0.5) - axmatrix.set_yticklabels(np.array(labels).astype('str')[index]) + axmatrix.set_yticklabels(np.array(labels).astype("str")[index]) -# axmatrix.xaxis.set_label_position('top') + # axmatrix.xaxis.set_label_position('top') axmatrix.xaxis.set_tick_params(labeltop=True) axmatrix.xaxis.set_tick_params(labelbottom=False) axmatrix.set_xticks(np.arange(corr_matrix.shape[0]) + 0.5) - axmatrix.set_xticklabels(np.array(labels).astype('str')[index], - rotation=45, - ha='left') + axmatrix.set_xticklabels( + np.array(labels).astype("str")[index], rotation=45, ha="left" + ) - axmatrix.tick_params( - axis='x', - which='both', - bottom=False, - top=False) + axmatrix.tick_params(axis="x", which="both", bottom=False, top=False) - axmatrix.tick_params( - axis='y', - which='both', - left=False, - right=False) + axmatrix.tick_params(axis="y", which="both", left=False, right=False) # axmatrix.set_xticks([]) # Plot colorbar. axcolor = fig.add_axes([0.13, 0.065, 0.6, 0.02]) - cobar = plt.colorbar(img_mat, cax=axcolor, orientation='horizontal') + cobar = plt.colorbar(img_mat, cax=axcolor, orientation="horizontal") cobar.solids.set_edgecolor("face") if plot_numbers: for row in range(num_rows): for col in range(num_rows): - axmatrix.text(row + 0.5, col + 0.5, - "{:.2f}".format(corr_matrix[row, col]), - ha='center', va='center') + axmatrix.text( + row + 0.5, + col + 0.5, + "{:.2f}".format(corr_matrix[row, col]), + ha="center", + va="center", + ) fig.savefig(plotFileName, format=image_format) fig.close() diff --git a/deeptools/countReadsPerBin.py b/deeptools/countReadsPerBin.py index f9fe77cd1..87b283222 100644 --- a/deeptools/countReadsPerBin.py +++ b/deeptools/countReadsPerBin.py @@ -13,7 +13,7 @@ import pyBigWig debug = 0 -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") def countReadsInRegions_wrapper(args): @@ -29,7 +29,6 @@ def countReadsInRegions_wrapper(args): class CountReadsPerBin(object): - r"""Collects coverage over multiple bam files using multiprocessing This function collects read counts (coverage) from several bam files and returns @@ -160,28 +159,35 @@ class CountReadsPerBin(object): [0., 1., 1., 2.]]) """ - def __init__(self, bamFilesList, binLength=50, numberOfSamples=None, numberOfProcessors=1, - verbose=False, region=None, - bedFile=None, extendReads=False, - genomeChunkSize=None, - blackListFileName=None, - minMappingQuality=None, - ignoreDuplicates=False, - chrsToSkip=[], - stepSize=None, - center_read=False, - samFlag_include=None, - samFlag_exclude=None, - zerosToNans=False, - skipZeroOverZero=False, - smoothLength=0, - minFragmentLength=0, - maxFragmentLength=0, - out_file_for_raw_data=None, - bed_and_bin=False, - 
statsList=[], - mappedList=[]): - + def __init__( + self, + bamFilesList, + binLength=50, + numberOfSamples=None, + numberOfProcessors=1, + verbose=False, + region=None, + bedFile=None, + extendReads=False, + genomeChunkSize=None, + blackListFileName=None, + minMappingQuality=None, + ignoreDuplicates=False, + chrsToSkip=[], + stepSize=None, + center_read=False, + samFlag_include=None, + samFlag_exclude=None, + zerosToNans=False, + skipZeroOverZero=False, + smoothLength=0, + minFragmentLength=0, + maxFragmentLength=0, + out_file_for_raw_data=None, + bed_and_bin=False, + statsList=[], + mappedList=[], + ): self.bamFilesList = bamFilesList self.binLength = binLength self.numberOfSamples = numberOfSamples @@ -194,33 +200,48 @@ def __init__(self, bamFilesList, binLength=50, numberOfSamples=None, numberOfPro if extendReads and len(bamFilesList): from deeptools.getFragmentAndReadSize import get_read_and_fragment_length - frag_len_dict, read_len_dict = get_read_and_fragment_length(bamFilesList[0], - return_lengths=False, - blackListFileName=blackListFileName, - numberOfProcessors=numberOfProcessors, - verbose=verbose) + + frag_len_dict, read_len_dict = get_read_and_fragment_length( + bamFilesList[0], + return_lengths=False, + blackListFileName=blackListFileName, + numberOfProcessors=numberOfProcessors, + verbose=verbose, + ) if extendReads is True: # try to guess fragment length if the bam file contains paired end reads if frag_len_dict: - self.defaultFragmentLength = int(frag_len_dict['median']) + self.defaultFragmentLength = int(frag_len_dict["median"]) else: - exit("*ERROR*: library is not paired-end. Please provide an extension length.") + exit( + "*ERROR*: library is not paired-end. Please provide an extension length." + ) if verbose: - print(("Fragment length based on paired en data " - "estimated to be {}".format(frag_len_dict['median']))) - - elif extendReads < read_len_dict['median']: - sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). " - "Reads will not be extended.\n".format(int(read_len_dict['median']))) - self.defaultFragmentLength = 'read length' + print( + ( + "Fragment length based on paired en data " + "estimated to be {}".format(frag_len_dict["median"]) + ) + ) + + elif extendReads < read_len_dict["median"]: + sys.stderr.write( + "*WARNING*: read extension is smaller than read length (read length = {}). " + "Reads will not be extended.\n".format(int(read_len_dict["median"])) + ) + self.defaultFragmentLength = "read length" elif extendReads > 2000: - exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(extendReads)) + exit( + "*ERROR*: read extension must be smaller that 2000. 
Value give: {} ".format( + extendReads + ) + ) else: self.defaultFragmentLength = int(extendReads) else: - self.defaultFragmentLength = 'read length' + self.defaultFragmentLength = "read length" self.numberOfProcessors = numberOfProcessors self.verbose = verbose @@ -247,9 +268,11 @@ def __init__(self, bamFilesList, binLength=50, numberOfSamples=None, numberOfPro # check that wither numberOfSamples or stepSize are set if numberOfSamples is None and stepSize is None and bedFile is None: - raise ValueError("either stepSize, numberOfSamples or bedFile have to be set") + raise ValueError( + "either stepSize, numberOfSamples or bedFile have to be set" + ) - if self.defaultFragmentLength != 'read length': + if self.defaultFragmentLength != "read length": self.maxPairedFragmentLength = 4 * self.defaultFragmentLength else: self.maxPairedFragmentLength = 1000 @@ -259,7 +282,9 @@ def __init__(self, bamFilesList, binLength=50, numberOfSamples=None, numberOfPro if len(self.mappedList) == 0: try: for fname in self.bamFilesList: - bam, mapped, unmapped, stats = bamHandler.openBam(fname, returnStats=True, nThreads=self.numberOfProcessors) + bam, mapped, unmapped, stats = bamHandler.openBam( + fname, returnStats=True, nThreads=self.numberOfProcessors + ) self.mappedList.append(mapped) self.statsList.append(stats) bam.close() @@ -278,13 +303,17 @@ def get_chunk_length(self, bamFilesHandles, genomeSize, chromSizes, chrLengths): else: # compute the step size, based on the number of samples # and the length of the region studied - (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[:3] + (chrom, start, end) = mapReduce.getUserRegion(chromSizes, self.region)[ + :3 + ] self.stepSize = max(int(float(end - start) / self.numberOfSamples), 1) # number of samples is better if large if np.mean(chrLengths) < self.stepSize and self.bedFile is None: min_num_of_samples = int(genomeSize / np.mean(chrLengths)) - raise ValueError("numberOfSamples has to be bigger than {} ".format(min_num_of_samples)) + raise ValueError( + "numberOfSamples has to be bigger than {} ".format(min_num_of_samples) + ) max_mapped = 0 if len(self.mappedList) > 0: @@ -319,7 +348,9 @@ def run(self, allArgs=None): y = pyBigWig.open(x) bamFilesHandles.append(y) - chromsizes, non_common = deeptools.utilities.getCommonChrNames(bamFilesHandles, verbose=self.verbose) + chromsizes, non_common = deeptools.utilities.getCommonChrNames( + bamFilesHandles, verbose=self.verbose + ) # skip chromosome in the list. 
This is usually for the # X chromosome which may have either one copy in a male sample @@ -336,7 +367,9 @@ def run(self, allArgs=None): chunkSize = None if self.bedFile is None: if self.genomeChunkSize is None: - chunkSize = self.get_chunk_length(bamFilesHandles, genomeSize, chromsizes, chrLengths) + chunkSize = self.get_chunk_length( + bamFilesHandles, genomeSize, chromsizes, chrLengths + ) else: chunkSize = self.genomeChunkSize @@ -350,34 +383,43 @@ def run(self, allArgs=None): self.region += ":{}".format(self.binLength) # Handle GTF options - transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs) + ( + transcriptID, + exonID, + transcript_id_designator, + keepExons, + ) = deeptools.utilities.gtfOptions(allArgs) # use map reduce to call countReadsInRegions_wrapper - imap_res = mapReduce.mapReduce([], - countReadsInRegions_wrapper, - chromsizes, - self_=self, - genomeChunkLength=chunkSize, - bedFile=self.bedFile, - blackListFileName=self.blackListFileName, - region=self.region, - numberOfProcessors=self.numberOfProcessors, - transcriptID=transcriptID, - exonID=exonID, - keepExons=keepExons, - transcript_id_designator=transcript_id_designator) + imap_res = mapReduce.mapReduce( + [], + countReadsInRegions_wrapper, + chromsizes, + self_=self, + genomeChunkLength=chunkSize, + bedFile=self.bedFile, + blackListFileName=self.blackListFileName, + region=self.region, + numberOfProcessors=self.numberOfProcessors, + transcriptID=transcriptID, + exonID=exonID, + keepExons=keepExons, + transcript_id_designator=transcript_id_designator, + ) if self.out_file_for_raw_data: if len(non_common): - sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for " - "the chromosomes that were not common between the bigwig files\n") + sys.stderr.write( + "*Warning*\nThe resulting bed file does not contain information for " + "the chromosomes that were not common between the bigwig files\n" + ) # concatenate intermediary bedgraph files ofile = open(self.out_file_for_raw_data, "w") for _values, tempFileName in imap_res: if tempFileName: # concatenate all intermediate tempfiles into one - _foo = open(tempFileName, 'r') + _foo = open(tempFileName, "r") shutil.copyfileobj(_foo, ofile) _foo.close() os.remove(tempFileName) @@ -390,12 +432,16 @@ def run(self, allArgs=None): except ValueError: if self.bedFile: - sys.exit('\nNo coverage values could be computed.\n\n' - 'Please check that the chromosome names in the BED file are found on the bam files.\n\n' - 'The valid chromosome names are:\n{}'.format(chrNames)) + sys.exit( + "\nNo coverage values could be computed.\n\n" + "Please check that the chromosome names in the BED file are found on the bam files.\n\n" + "The valid chromosome names are:\n{}".format(chrNames) + ) else: - sys.exit('\nNo coverage values could be computed.\n\nCheck that all bam files are valid and ' - 'contain mapped reads.') + sys.exit( + "\nNo coverage values could be computed.\n\nCheck that all bam files are valid and " + "contain mapped reads." 
+ ) def count_reads_in_region(self, chrom, start, end, bed_regions_list=None): """Counts the reads in each bam file at each 'stepSize' position @@ -476,7 +522,9 @@ def count_reads_in_region(self, chrom, start, end, bed_regions_list=None): transcriptsToConsider = [] if bed_regions_list is not None: if self.bed_and_bin: - transcriptsToConsider.append([(x[1][0][0], x[1][0][1], self.binLength) for x in bed_regions_list]) + transcriptsToConsider.append( + [(x[1][0][0], x[1][0][1], self.binLength) for x in bed_regions_list] + ) else: transcriptsToConsider = [x[1] for x in bed_regions_list] else: @@ -486,15 +534,17 @@ def count_reads_in_region(self, chrom, start, end, bed_regions_list=None): for i in range(start, end, self.stepSize): if i + self.binLength > end: break - if blackList is not None and blackList.findOverlaps(chrom, i, i + self.binLength): + if blackList is not None and blackList.findOverlaps( + chrom, i, i + self.binLength + ): continue transcriptsToConsider.append([(i, i + self.binLength)]) if self.save_data: - _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t') + _file = open(deeptools.utilities.getTempFileName(suffix=".bed"), "w+t") _file_name = _file.name else: - _file_name = '' + _file_name = "" for bam in bam_handles: for trans in transcriptsToConsider: @@ -504,7 +554,9 @@ def count_reads_in_region(self, chrom, start, end, bed_regions_list=None): else: subnum_reads_per_bin.extend(tcov) - subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape(-1, len(self.bamFilesList), order='F') + subnum_reads_per_bin = np.concatenate([subnum_reads_per_bin]).reshape( + -1, len(self.bamFilesList), order="F" + ) if self.save_data: idx = 0 @@ -513,7 +565,10 @@ def count_reads_in_region(self, chrom, start, end, bed_regions_list=None): starts = ",".join([str(x[0]) for x in trans]) ends = ",".join([str(x[1]) for x in trans]) _file.write("\t".join([chrom, starts, ends]) + "\t") - _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[i, :]]) + "\n") + _file.write( + "\t".join(["{}".format(x) for x in subnum_reads_per_bin[i, :]]) + + "\n" + ) else: for exon in trans: for startPos in range(exon[0], exon[1], exon[2]): @@ -521,23 +576,44 @@ def count_reads_in_region(self, chrom, start, end, bed_regions_list=None): # At the end of chromosomes (or due to blacklisted regions), there are bins smaller than the bin size # Counts there are added to the bin before them, but range() will still try to include them. 
break - _file.write("{0}\t{1}\t{2}\t".format(chrom, startPos, min(startPos + exon[2], exon[1]))) - _file.write("\t".join(["{}".format(x) for x in subnum_reads_per_bin[idx, :]]) + "\n") + _file.write( + "{0}\t{1}\t{2}\t".format( + chrom, startPos, min(startPos + exon[2], exon[1]) + ) + ) + _file.write( + "\t".join( + [ + "{}".format(x) + for x in subnum_reads_per_bin[idx, :] + ] + ) + + "\n" + ) idx += 1 _file.close() if self.verbose: endTime = time.time() rows = subnum_reads_per_bin.shape[0] - print("%s countReadsInRegions_worker: processing %d " - "(%.1f per sec) @ %s:%s-%s" % - (multiprocessing.current_process().name, - rows, rows / (endTime - start_time), chrom, start, end)) + print( + "%s countReadsInRegions_worker: processing %d " + "(%.1f per sec) @ %s:%s-%s" + % ( + multiprocessing.current_process().name, + rows, + rows / (endTime - start_time), + chrom, + start, + end, + ) + ) return subnum_reads_per_bin, _file_name - def get_coverage_of_region(self, bamHandle, chrom, regions, - fragmentFromRead_func=None): + def get_coverage_of_region( + self, bamHandle, chrom, regions, fragmentFromRead_func=None + ): """ Returns a numpy array that corresponds to the number of reads that overlap with each tile. @@ -581,9 +657,9 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, nbins += (reg[1] - reg[0]) // reg[2] if (reg[1] - reg[0]) % reg[2] > 0: nbins += 1 - coverages = np.zeros(nbins, dtype='float64') + coverages = np.zeros(nbins, dtype="float64") - if self.defaultFragmentLength == 'read length': + if self.defaultFragmentLength == "read length": extension = 0 else: extension = self.maxPairedFragmentLength @@ -636,7 +712,10 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, continue # filter reads based on SAM flag - if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include: + if ( + self.samFlag_include + and read.flag & self.samFlag_include != self.samFlag_include + ): continue if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0: continue @@ -660,8 +739,11 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, e = s - tLen if read.reference_id != read.next_reference_id: e = read.pnext - if lpos is not None and lpos == read.reference_start \ - and (s, e, read.next_reference_id, read.is_reverse) in prev_pos: + if ( + lpos is not None + and lpos == read.reference_start + and (s, e, read.next_reference_id, read.is_reverse) in prev_pos + ): continue if lpos != read.reference_start: prev_pos.clear() @@ -695,7 +777,10 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, fragmentEnd = reg[0] + len(coverages) * tileSize sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0) - eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins) + eIdx = vector_start + min( + np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype("int"), + nRegBins, + ) if last_eIdx is not None: sIdx = max(last_eIdx, sIdx) if sIdx >= eIdx: @@ -709,8 +794,17 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, if self.verbose: endTime = time.time() - print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % ( - multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1])) + print( + "%s, processing %s (%.1f per sec) reads @ %s:%s-%s" + % ( + multiprocessing.current_process().name, + c, + c / (endTime - start_time), + chrom, + reg[0], + reg[1], + ) + ) vector_start += nRegBins @@ -871,7 +965,7 @@ def get_fragment_from_read(self, read): # E.g for a cigar 
of 40M260N22M # get blocks return two elements for the first 40 matches # and the other for the last 22 matches. - if self.defaultFragmentLength == 'read length': + if self.defaultFragmentLength == "read length": return read.get_blocks() else: @@ -896,11 +990,16 @@ def get_fragment_from_read(self, read): if self.center_read: fragmentCenter = fragmentEnd - (fragmentEnd - fragmentStart) / 2 - fragmentStart = int(fragmentCenter - read.infer_query_length(always=False) / 2) + fragmentStart = int( + fragmentCenter - read.infer_query_length(always=False) / 2 + ) fragmentEnd = fragmentStart + read.infer_query_length(always=False) - assert fragmentStart < fragmentEnd, "fragment start greater than fragment" \ - "end for read {}".format(read.query_name) + assert ( + fragmentStart < fragmentEnd + ), "fragment start greater than fragment " "end for read {}".format( + read.query_name + ) return [(fragmentStart, fragmentEnd)] def getSmoothRange(self, tileIndex, tileSize, smoothRange, maxPosition): @@ -988,11 +1087,10 @@ def estimateSizeFactors(m): loggeomeans = np.ma.masked_where(np.isinf(loggeomeans), loggeomeans) # DESeq2 ratio-based size factor sf = np.exp(np.ma.median((np.log(m).T - loggeomeans).T, axis=0)) - return 1. / sf + return 1.0 / sf class Tester(object): - def __init__(self): """ The distribution of reads between the two bam files is as follows. @@ -1014,20 +1112,19 @@ def __init__(self): self.bamFile1 = self.root + "testA.bam" self.bamFile2 = self.root + "testB.bam" self.bamFile_PE = self.root + "test_paired2.bam" - self.chrom = '3R' + self.chrom = "3R" global debug debug = 0 def getRead(self, readType): - """ prepare arguments for test - """ + """prepare arguments for test""" bam = bamHandler.openBam(self.bamFile_PE) - if readType == 'paired-reverse': - read = [x for x in bam.fetch('chr2', 5000081, 5000082)][0] - elif readType == 'single-forward': - read = [x for x in bam.fetch('chr2', 5001491, 5001492)][0] - elif readType == 'single-reverse': - read = [x for x in bam.fetch('chr2', 5001700, 5001701)][0] + if readType == "paired-reverse": + read = [x for x in bam.fetch("chr2", 5000081, 5000082)][0] + elif readType == "single-forward": + read = [x for x in bam.fetch("chr2", 5001491, 5001492)][0] + elif readType == "single-reverse": + read = [x for x in bam.fetch("chr2", 5001700, 5001701)][0] else: # by default a forward paired read is returned - read = [x for x in bam.fetch('chr2', 5000027, 5000028)][0] + read = [x for x in bam.fetch("chr2", 5000027, 5000028)][0] return read
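For reference, estimateSizeFactors above is the DESeq2-style median-of-ratios estimator: each bin is divided by its geometric mean across samples, the per-sample median of those ratios is the size factor, and deepTools returns its reciprocal as the scale factor. A minimal numpy sketch of the same computation, omitting the masking of zero and infinite bins:

import numpy as np

def size_factors(m):
    # m: bins x samples count matrix with no all-zero bins (masking omitted)
    loggeomeans = np.log(m).mean(axis=1)                # per-bin log geometric mean
    sf = np.exp(np.median(np.log(m).T - loggeomeans, axis=1))
    return 1.0 / sf                                     # reciprocal, as in the hunk above

m = np.array([[10.0, 20.0], [30.0, 60.0], [5.0, 10.0]])
print(size_factors(m))  # ~[1.41, 0.71]: the sample with 2x depth is scaled down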
""" - itree = GTF(BED, transcriptID=args.transcriptID, transcript_id_designator=args.transcript_id_designator) + itree = GTF( + BED, + transcriptID=args.transcriptID, + transcript_id_designator=args.transcript_id_designator, + ) o = [] extend = 0 # The before/after stuff is specific to computeMatrix if "beforeRegionStartLength" in args: extend = max(args.beforeRegionStartLength, args.afterRegionStartLength) for chrom in itree.chroms: - regs = itree.findOverlaps(chrom, 0, 4294967295) # bigWig files use 32 bit coordinates + regs = itree.findOverlaps( + chrom, 0, 4294967295 + ) # bigWig files use 32 bit coordinates for reg in regs: o.append([chrom, max(0, reg[0] - extend), reg[1] + extend]) del itree @@ -116,7 +122,12 @@ def preloadWrapper(foo): class deepBlue(object): - def __init__(self, sample, url="http://deepblue.mpi-inf.mpg.de/xmlrpc", userKey="anonymous_key"): + def __init__( + self, + sample, + url="http://deepblue.mpi-inf.mpg.de/xmlrpc", + userKey="anonymous_key", + ): """ Connect to the requested deepblue server with the given user key and request the specifed sample from it. @@ -137,23 +148,35 @@ def __init__(self, sample, url="http://deepblue.mpi-inf.mpg.de/xmlrpc", userKey= # Set self.experimentID experimentID = self.getEID() if not experimentID: - raise RuntimeError("The requested sample({}) has no associated experiment! If you did not intend to use samples on deepBlue, then it appears either you misspelled a file name or (if you're using BAM files for input) one of your BAM files is lacking a valid index.".format(sample)) + raise RuntimeError( + "The requested sample({}) has no associated experiment! If you did not intend to use samples on deepBlue, then it appears either you misspelled a file name or (if you're using BAM files for input) one of your BAM files is lacking a valid index.".format( + sample + ) + ) # Set self.info (status, resp) = self.server.info(self.experimentID, userKey) if status != "okay": - raise RuntimeError("Received the following error while fetching information about '{}': {}".format(resp, sample)) + raise RuntimeError( + "Received the following error while fetching information about '{}': {}".format( + resp, sample + ) + ) self.info = resp[0] # Set self.genome genome = self.getGenome() if not genome: - raise RuntimeError("Unable to determine an appropriate genome for '{}'".format(sample)) + raise RuntimeError( + "Unable to determine an appropriate genome for '{}'".format(sample) + ) # Set self.chroms chroms = self.getChroms() if not chroms: - raise RuntimeError("Unable to determine chromosome names/sizes for '{}'".format(sample)) + raise RuntimeError( + "Unable to determine chromosome names/sizes for '{}'".format(sample) + ) def getEID(self): """ @@ -163,7 +186,11 @@ def getEID(self): """ (status, resps) = self.server.search(self.sample, "experiments", self.userKey) if status != "okay": - raise RuntimeError("Received an error ({}) while searching for the experiment associated with '{}'".format(resps, self.sample)) + raise RuntimeError( + "Received an error ({}) while searching for the experiment associated with '{}'".format( + resps, self.sample + ) + ) for resp in resps: if resp[1] == self.sample: self.experimentID = resp[0] @@ -188,7 +215,11 @@ def getChroms(self): """ (status, resp) = self.server.chromosomes(self.genome, self.userKey) if status != "okay": - raise RuntimeError("Received an error while fetching chromosome information for '{}': {}".format(self.sample, resp)) + raise RuntimeError( + "Received an error while fetching chromosome 
information for '{}': {}".format( + self.sample, resp + ) + ) self.chromsDict = {k: v for k, v in resp} self.chromsTuple = [(k, v) for k, v in resp] return resp @@ -234,32 +265,66 @@ def preload(self, regions, tmpDir=None): continue if chrom not in regions2 or len(regions2) == 0: continue - regionsStr = "\n".join(["{}\t{}\t{}".format(k, reg[0], reg[1]) for reg in regions2[chrom]]) + regionsStr = "\n".join( + ["{}\t{}\t{}".format(k, reg[0], reg[1]) for reg in regions2[chrom]] + ) regionsStr += "\n" # Send the regions - (status, regionsID) = self.server.input_regions(self.genome, regionsStr, self.userKey) + (status, regionsID) = self.server.input_regions( + self.genome, regionsStr, self.userKey + ) if status != "okay": - raise RuntimeError("Received the following error while sending regions for '{}': {}".format(regionsID, self.sample)) + raise RuntimeError( + "Received the following error while sending regions for '{}': {}".format( + regionsID, self.sample + ) + ) # Get the experiment information - (status, queryID) = self.server.select_experiments(self.sample, k, None, None, self.userKey) + (status, queryID) = self.server.select_experiments( + self.sample, k, None, None, self.userKey + ) if status != "okay": - raise RuntimeError("Received the following error while running select_experiments on file '{}': {}".format(self.sample, queryID)) + raise RuntimeError( + "Received the following error while running select_experiments on file '{}': {}".format( + self.sample, queryID + ) + ) if not queryID: - raise RuntimeError("Somehow, we received None as a query ID (file '{}')".format(self.sample)) + raise RuntimeError( + "Somehow, we received None as a query ID (file '{}')".format( + self.sample + ) + ) # Intersect - (status, intersectID) = self.server.intersection(queryID, regionsID, self.userKey) + (status, intersectID) = self.server.intersection( + queryID, regionsID, self.userKey + ) if status != "okay": - raise RuntimeError("Received the following error while running intersection on file '{}': {}".format(self.sample, intersectID)) + raise RuntimeError( + "Received the following error while running intersection on file '{}': {}".format( + self.sample, intersectID + ) + ) if not intersectID: - raise RuntimeError("Somehow, we received None as an intersect ID (file '{}')".format(self.sample)) + raise RuntimeError( + "Somehow, we received None as an intersect ID (file '{}')".format( + self.sample + ) + ) # Query the regions - (status, reqID) = self.server.get_regions(intersectID, "START,END,VALUE", self.userKey) + (status, reqID) = self.server.get_regions( + intersectID, "START,END,VALUE", self.userKey + ) if status != "okay": - raise RuntimeError("Received the following error while fetching regions in file '{}': {}".format(self.sample, reqID)) + raise RuntimeError( + "Received the following error while fetching regions in file '{}': {}".format( + self.sample, reqID + ) + ) # Wait for the server to process the data (status, info) = self.server.info(reqID, self.userKey) @@ -272,15 +337,28 @@ def preload(self, regions, tmpDir=None): # Get the actual data (status, resp) = self.server.get_request_data(reqID, self.userKey) if status != "okay": - raise RuntimeError("Received the following error while fetching data in file '{}': {}".format(self.sample, resp)) + raise RuntimeError( + "Received the following error while fetching data in file '{}': {}".format( + self.sample, resp + ) + ) for intervals in resp.split("\n"): interval = intervals.split("\t") - if interval[0] == '': + if interval[0] == "": 
continue - bw.addEntries([k], [int(interval[0]) - 1], ends=[int(interval[1]) - 1], values=[float(interval[2])]) + bw.addEntries( + [k], + [int(interval[0]) - 1], + ends=[int(interval[1]) - 1], + values=[float(interval[2])], + ) bw.close() - sys.stderr.write("{} done (took {})\n".format(self.sample, datetime.datetime.now() - startTime)) + sys.stderr.write( + "{} done (took {})\n".format( + self.sample, datetime.datetime.now() - startTime + ) + ) sys.stderr.flush() return fname diff --git a/deeptools/deeptools_list_tools.py b/deeptools/deeptools_list_tools.py index 11807b04a..806c70eb0 100644 --- a/deeptools/deeptools_list_tools.py +++ b/deeptools/deeptools_list_tools.py @@ -57,10 +57,12 @@ def parse_arguments(args=None): For more information visit: http://deeptools.readthedocs.org -""") +""", + ) - parser.add_argument('--version', action='version', - version='%(prog)s {}'.format(__version__)) + parser.add_argument( + "--version", action="version", version="%(prog)s {}".format(__version__) + ) return parser diff --git a/deeptools/estimateReadFiltering.py b/deeptools/estimateReadFiltering.py index 464fe0999..3448cae93 100644 --- a/deeptools/estimateReadFiltering.py +++ b/deeptools/estimateReadFiltering.py @@ -31,117 +31,151 @@ def parseArguments(): The sum of these may be more than the total number of reads. Note that alignments are sampled from bins of size --binSize spaced --distanceBetweenBins apart. """, - usage='Example usage: estimateReadFiltering.py -b sample1.bam sample2.bam > log.txt') - - required = parser.add_argument_group('Required arguments') - required.add_argument('--bamfiles', '-b', - metavar='FILE1 FILE2', - help='List of indexed bam files separated by spaces.', - nargs='+', - required=True) - - general = parser.add_argument_group('General arguments') - - general.add_argument('--outFile', '-o', - type=parserCommon.writableFile, - help='The file to write results to. By default, results are printed to the console') - - general.add_argument('--sampleLabels', - help='Labels for the samples. The ' - 'default is to use the file name of the ' - 'sample. The sample labels should be separated ' - 'by spaces and quoted if a label itself' - 'contains a space E.g. --sampleLabels label-1 "label 2" ', - nargs='+') - - general.add_argument('--smartLabels', - action='store_true', - help='Instead of manually specifying labels for the input ' - 'BAM files, this causes deepTools to use the ' - 'file name after removing the path and extension.') - - general.add_argument('--binSize', '-bs', - metavar='INT', - help='Length in bases of the window used to sample the genome. (Default: %(default)s)', - default=1000000, - type=int) - - general.add_argument('--distanceBetweenBins', '-n', - metavar='INT', - help='To reduce the computation time, not every possible genomic ' - 'bin is sampled. This option allows you to set the distance ' - 'between bins actually sampled from. Larger numbers are sufficient ' - 'for high coverage samples, while smaller values are useful for ' - 'lower coverage samples. Note that if you specify a value that ' - 'results in too few (<1000) reads sampled, the value will be ' - 'decreased. (Default: %(default)s)', - default=10000, - type=int) - - general.add_argument('--numberOfProcessors', '-p', - help='Number of processors to use. Type "max/2" to ' - 'use half the maximum number of processors or "max" ' - 'to use all available processors. 
(Default: %(default)s)', - metavar="INT", - type=parserCommon.numberOfProcessors, - default=1, - required=False) - - general.add_argument('--verbose', '-v', - help='Set to see processing messages.', - action='store_true') - - general.add_argument('--version', action='version', - version='%(prog)s {}'.format(__version__)) - - filtering = parser.add_argument_group('Optional arguments') - - filtering.add_argument('--filterRNAstrand', - help='Selects RNA-seq reads (single-end or paired-end) in ' - 'the given strand. (Default: %(default)s)', - choices=['forward', 'reverse'], - default=None) - - filtering.add_argument('--ignoreDuplicates', - help='If set, reads that have the same orientation ' - 'and start position will be considered only ' - 'once. If reads are paired, the mate\'s position ' - 'also has to coincide to ignore a read.', - action='store_true') - - filtering.add_argument('--minMappingQuality', - metavar='INT', - help='If set, only reads that have a mapping ' - 'quality score of at least this are ' - 'considered.', - type=int) - - filtering.add_argument('--samFlagInclude', - help='Include reads based on the SAM flag. For example, ' - 'to get only reads that are the first mate, use a flag of 64. ' - 'This is useful to count properly paired reads only once, ' - 'as otherwise the second mate will be also considered for the ' - 'coverage. (Default: %(default)s)', - metavar='INT', - default=None, - type=int, - required=False) - - filtering.add_argument('--samFlagExclude', - help='Exclude reads based on the SAM flag. For example, ' - 'to get only reads that map to the forward strand, use ' - '--samFlagExclude 16, where 16 is the SAM flag for reads ' - 'that map to the reverse strand. (Default: %(default)s)', - metavar='INT', - default=None, - type=int, - required=False) - - filtering.add_argument('--blackListFileName', '-bl', - help="A BED or GTF file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant.", - metavar="BED file", - nargs="+", - required=False) + usage="Example usage: estimateReadFiltering.py -b sample1.bam sample2.bam > log.txt", + ) + + required = parser.add_argument_group("Required arguments") + required.add_argument( + "--bamfiles", + "-b", + metavar="FILE1 FILE2", + help="List of indexed bam files separated by spaces.", + nargs="+", + required=True, + ) + + general = parser.add_argument_group("General arguments") + + general.add_argument( + "--outFile", + "-o", + type=parserCommon.writableFile, + help="The file to write results to. By default, results are printed to the console", + ) + + general.add_argument( + "--sampleLabels", + help="Labels for the samples. The " + "default is to use the file name of the " + "sample. The sample labels should be separated " + "by spaces and quoted if a label itself " + 'contains a space E.g. --sampleLabels label-1 "label 2" ', + nargs="+", + ) + + general.add_argument( + "--smartLabels", + action="store_true", + help="Instead of manually specifying labels for the input " + "BAM files, this causes deepTools to use the " + "file name after removing the path and extension.", + ) + + general.add_argument( + "--binSize", + "-bs", + metavar="INT", + help="Length in bases of the window used to sample the genome. (Default: %(default)s)", + default=1000000, + type=int, + ) + + general.add_argument( + "--distanceBetweenBins", + "-n", + metavar="INT", + help="To reduce the computation time, not every possible genomic " + "bin is sampled. This option allows you to set the distance " + "between bins actually sampled from. Larger numbers are sufficient " + "for high coverage samples, while smaller values are useful for " + "lower coverage samples. Note that if you specify a value that " + "results in too few (<1000) reads sampled, the value will be " + "decreased. (Default: %(default)s)", + default=10000, + type=int, + ) + + general.add_argument( + "--numberOfProcessors", + "-p", + help='Number of processors to use. Type "max/2" to ' + 'use half the maximum number of processors or "max" ' + "to use all available processors. (Default: %(default)s)", + metavar="INT", + type=parserCommon.numberOfProcessors, + default=1, + required=False, + ) + + general.add_argument( + "--verbose", "-v", help="Set to see processing messages.", action="store_true" + ) + + general.add_argument( + "--version", action="version", version="%(prog)s {}".format(__version__) + ) + + filtering = parser.add_argument_group("Optional arguments") + + filtering.add_argument( + "--filterRNAstrand", + help="Selects RNA-seq reads (single-end or paired-end) in " + "the given strand. (Default: %(default)s)", + choices=["forward", "reverse"], + default=None, + ) + + filtering.add_argument( + "--ignoreDuplicates", + help="If set, reads that have the same orientation " + "and start position will be considered only " + "once. If reads are paired, the mate's position " + "also has to coincide to ignore a read.", + action="store_true", + ) + + filtering.add_argument( + "--minMappingQuality", + metavar="INT", + help="If set, only reads that have a mapping " + "quality score of at least this are " + "considered.", + type=int, + ) + + filtering.add_argument( + "--samFlagInclude", + help="Include reads based on the SAM flag. For example, " + "to get only reads that are the first mate, use a flag of 64. " + "This is useful to count properly paired reads only once, " + "as otherwise the second mate will be also considered for the " + "coverage. (Default: %(default)s)", + metavar="INT", + default=None, + type=int, + required=False, + ) + + filtering.add_argument( + "--samFlagExclude", + help="Exclude reads based on the SAM flag. For example, " + "to get only reads that map to the forward strand, use " + "--samFlagExclude 16, where 16 is the SAM flag for reads " + "that map to the reverse strand. (Default: %(default)s)", + metavar="INT", + default=None, + type=int, + required=False, + )
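As the help texts above describe, --samFlagInclude and --samFlagExclude are plain bit masks over the SAM FLAG field: a read passes --samFlagInclude only if it carries every requested bit, and passes --samFlagExclude only if it carries none of them. A small self-contained illustration of the same tests (hypothetical helper; flag values taken from the SAM spec):

def keep(flag, include=None, exclude=None):
    if include is not None and flag & include != include:
        return False  # must carry every bit in `include`
    if exclude is not None and flag & exclude != 0:
        return False  # must carry none of the bits in `exclude`
    return True

assert keep(99, include=64)       # 99 = 1+2+32+64: first mate, kept
assert not keep(147, include=64)  # 147 = 1+2+16+128: second mate, dropped
assert not keep(16, exclude=16)   # 16 = read on the reverse strand, dropped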
(Default: %(default)s)", + default=1000000, + type=int, + ) + + general.add_argument( + "--distanceBetweenBins", + "-n", + metavar="INT", + help="To reduce the computation time, not every possible genomic " + "bin is sampled. This option allows you to set the distance " + "between bins actually sampled from. Larger numbers are sufficient " + "for high coverage samples, while smaller values are useful for " + "lower coverage samples. Note that if you specify a value that " + "results in too few (<1000) reads sampled, the value will be " + "decreased. (Default: %(default)s)", + default=10000, + type=int, + ) + + general.add_argument( + "--numberOfProcessors", + "-p", + help='Number of processors to use. Type "max/2" to ' + 'use half the maximum number of processors or "max" ' + "to use all available processors. (Default: %(default)s)", + metavar="INT", + type=parserCommon.numberOfProcessors, + default=1, + required=False, + ) + + general.add_argument( + "--verbose", "-v", help="Set to see processing messages.", action="store_true" + ) + + general.add_argument( + "--version", action="version", version="%(prog)s {}".format(__version__) + ) + + filtering = parser.add_argument_group("Optional arguments") + + filtering.add_argument( + "--filterRNAstrand", + help="Selects RNA-seq reads (single-end or paired-end) in " + "the given strand. (Default: %(default)s)", + choices=["forward", "reverse"], + default=None, + ) + + filtering.add_argument( + "--ignoreDuplicates", + help="If set, reads that have the same orientation " + "and start position will be considered only " + "once. If reads are paired, the mate's position " + "also has to coincide to ignore a read.", + action="store_true", + ) + + filtering.add_argument( + "--minMappingQuality", + metavar="INT", + help="If set, only reads that have a mapping " + "quality score of at least this are " + "considered.", + type=int, + ) + + filtering.add_argument( + "--samFlagInclude", + help="Include reads based on the SAM flag. For example, " + "to get only reads that are the first mate, use a flag of 64. " + "This is useful to count properly paired reads only once, " + "as otherwise the second mate will be also considered for the " + "coverage. (Default: %(default)s)", + metavar="INT", + default=None, + type=int, + required=False, + ) + + filtering.add_argument( + "--samFlagExclude", + help="Exclude reads based on the SAM flag. For example, " + "to get only reads that map to the forward strand, use " + "--samFlagExclude 16, where 16 is the SAM flag for reads " + "that map to the reverse strand. (Default: %(default)s)", + metavar="INT", + default=None, + type=int, + required=False, + ) + + filtering.add_argument( + "--blackListFileName", + "-bl", + help="A BED or GTF file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. 
Please note that you should adjust the effective genome size, if relevant.", + metavar="BED file", + nargs="+", + required=False, + ) return parser @@ -183,7 +217,10 @@ def getFiltered_worker(arglist): if args.minMappingQuality and read.mapq < args.minMappingQuality: filtered = 1 minMapq += 1 - if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude: + if ( + args.samFlagInclude + and read.flag & args.samFlagInclude != args.samFlagInclude + ): filtered = 1 samFlagInclude += 1 if args.samFlagExclude and read.flag & args.samFlagExclude != 0: @@ -199,8 +236,11 @@ def getFiltered_worker(arglist): e = s - read.tlen if read.reference_id != read.next_reference_id: e = read.pnext - if lpos is not None and lpos == read.reference_start \ - and (s, e, read.next_reference_id, read.is_reverse) in prev_pos: + if ( + lpos is not None + and lpos == read.reference_start + and (s, e, read.next_reference_id, read.is_reverse) in prev_pos + ): filtered = 1 internalDupes += 1 if lpos != read.reference_start: @@ -217,26 +257,26 @@ def getFiltered_worker(arglist): # filterRNAstrand if args.filterRNAstrand: if read.is_paired: - if args.filterRNAstrand == 'forward': + if args.filterRNAstrand == "forward": if read.flag & 144 == 128 or read.flag & 96 == 64: pass else: filtered = 1 filterRNAstrand += 1 - elif args.filterRNAstrand == 'reverse': + elif args.filterRNAstrand == "reverse": if read.flag & 144 == 144 or read.flag & 96 == 96: pass else: filtered = 1 filterRNAstrand += 1 else: - if args.filterRNAstrand == 'forward': + if args.filterRNAstrand == "forward": if read.flag & 16 == 16: pass else: filtered = 1 filterRNAstrand += 1 - elif args.filterRNAstrand == 'reverse': + elif args.filterRNAstrand == "reverse": if read.flag & 16 == 0: pass else: @@ -248,7 +288,17 @@ def getFiltered_worker(arglist): fh.close() # Append a tuple to the output - tup = (total, nFiltered, minMapq, samFlagInclude, samFlagExclude, internalDupes, externalDupes, singletons, filterRNAstrand) + tup = ( + total, + nFiltered, + minMapq, + samFlagInclude, + samFlagExclude, + internalDupes, + externalDupes, + singletons, + filterRNAstrand, + ) o.append(tup) return o @@ -260,7 +310,9 @@ def main(args=None): args.sampleLabels = smartLabels(args.bamfiles) if args.sampleLabels and len(args.sampleLabels) != len(args.bamfiles): - sys.stderr.write("\nError: --sampleLabels specified but it doesn't match the number of BAM files!\n") + sys.stderr.write( + "\nError: --sampleLabels specified but it doesn't match the number of BAM files!\n" + ) sys.exit(1) if args.outFile is None: @@ -268,7 +320,10 @@ def main(args=None): else: of = open(args.outFile, "w") - bhs = [bamHandler.openBam(x, returnStats=True, nThreads=args.numberOfProcessors) for x in args.bamfiles] + bhs = [ + bamHandler.openBam(x, returnStats=True, nThreads=args.numberOfProcessors) + for x in args.bamfiles + ] mapped = [x[1] for x in bhs] unmappedList = [x[2] for x in bhs] bhs = [x[0] for x in bhs] @@ -277,7 +332,11 @@ def main(args=None): if args.blackListFileName: blacklisted = [] for bh in bhs: - blacklisted.append(utilities.bam_blacklisted_reads(bh, None, args.blackListFileName, args.numberOfProcessors)) + blacklisted.append( + utilities.bam_blacklisted_reads( + bh, None, args.blackListFileName, args.numberOfProcessors + ) + ) else: blacklisted = [0] * len(bhs) @@ -289,13 +348,15 @@ def main(args=None): x.close() # Get the remaining metrics - res = mapReduce([args], - getFiltered_worker, - chrom_sizes, - genomeChunkLength=args.binSize + args.distanceBetweenBins, - 
blackListFileName=args.blackListFileName, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose) + res = mapReduce( + [args], + getFiltered_worker, + chrom_sizes, + genomeChunkLength=args.binSize + args.distanceBetweenBins, + blackListFileName=args.blackListFileName, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + ) totals = [0] * len(args.bamfiles) nFiltered = [0] * len(args.bamfiles) @@ -319,7 +380,9 @@ def main(args=None): rnaStrand[idx] += r[8] # Print some output - of.write("Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n") + of.write( + "Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n" + ) for idx, _ in enumerate(args.bamfiles): if args.sampleLabels: of.write(args.sampleLabels[idx]) @@ -329,7 +392,10 @@ def main(args=None): # nFiltered metric = 0.0 if totals[idx] > 0: - metric = blacklisted[idx] + float(nFiltered[idx]) / float(totals[idx]) * mapped[idx] + metric = ( + blacklisted[idx] + + float(nFiltered[idx]) / float(totals[idx]) * mapped[idx] + ) of.write("\t{}".format(min(round(metric, 1), mapped[idx]))) # MAPQ metric = 0.0 diff --git a/deeptools/getFragmentAndReadSize.py b/deeptools/getFragmentAndReadSize.py index 427d5308c..30d97f647 100644 --- a/deeptools/getFragmentAndReadSize.py +++ b/deeptools/getFragmentAndReadSize.py @@ -4,7 +4,7 @@ from deeptools import bamHandler from deeptools import mapReduce -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") def getFragmentLength_wrapper(args): @@ -38,15 +38,24 @@ def getFragmentLength_worker(chrom, start, end, bamFile, distanceBetweenBins): bam = bamHandler.openBam(bamFile) end = max(start + 1, end - distanceBetweenBins) if chrom in bam.references: - reads = np.array([(abs(r.template_length), r.infer_query_length(always=False)) - for r in bam.fetch(chrom, start, end) - if r.is_proper_pair and r.is_read1 and not r.is_unmapped]) + reads = np.array( + [ + (abs(r.template_length), r.infer_query_length(always=False)) + for r in bam.fetch(chrom, start, end) + if r.is_proper_pair and r.is_read1 and not r.is_unmapped + ] + ) if not len(reads): # if the previous operation produces an empty list # it could be that the data is not paired, then # we try with out filtering - reads = np.array([(abs(r.template_length), r.infer_query_length(always=False)) - for r in bam.fetch(chrom, start, end) if not r.is_unmapped]) + reads = np.array( + [ + (abs(r.template_length), r.infer_query_length(always=False)) + for r in bam.fetch(chrom, start, end) + if not r.is_unmapped + ] + ) else: raise NameError("chromosome {} not found in bam file".format(chrom)) @@ -56,9 +65,15 @@ def getFragmentLength_worker(chrom, start, end, bamFile, distanceBetweenBins): return reads -def get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileName=None, - binSize=50000, distanceBetweenBins=1000000, - numberOfProcessors=None, verbose=False): +def get_read_and_fragment_length( + bamFile, + return_lengths=False, + blackListFileName=None, + binSize=50000, + distanceBetweenBins=1000000, + numberOfProcessors=None, + verbose=False, +): """ Estimates the fragment length and read length through sampling @@ -87,26 +102,30 @@ def 
get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileNam # Fix issue #522, allow distanceBetweenBins == 0 if distanceBetweenBins == 0: - imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins), - getFragmentLength_wrapper, - chrom_sizes, - genomeChunkLength=binSize, - blackListFileName=blackListFileName, - numberOfProcessors=numberOfProcessors, - verbose=verbose) + imap_res = mapReduce.mapReduce( + (bam_handle.filename, distanceBetweenBins), + getFragmentLength_wrapper, + chrom_sizes, + genomeChunkLength=binSize, + blackListFileName=blackListFileName, + numberOfProcessors=numberOfProcessors, + verbose=verbose, + ) fl = np.concatenate(imap_res) # Try to ensure we have at least 1000 regions from which to compute statistics, halving the intra-bin distance as needed while len(fl) < 1000 and distanceBetweenBins > 1: distanceBetweenBins /= 2 stepsize = binSize + distanceBetweenBins - imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins), - getFragmentLength_wrapper, - chrom_sizes, - genomeChunkLength=stepsize, - blackListFileName=blackListFileName, - numberOfProcessors=numberOfProcessors, - verbose=verbose) + imap_res = mapReduce.mapReduce( + (bam_handle.filename, distanceBetweenBins), + getFragmentLength_wrapper, + chrom_sizes, + genomeChunkLength=stepsize, + blackListFileName=blackListFileName, + numberOfProcessors=numberOfProcessors, + verbose=verbose, + ) fl = np.concatenate(imap_res) @@ -114,50 +133,54 @@ def get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileNam fragment_length = fl[:, 0] read_length = fl[:, 1] if fragment_length.mean() > 0: - fragment_len_dict = {'sample_size': len(fragment_length), - 'min': fragment_length.min(), - 'qtile25': np.percentile(fragment_length, 25), - 'mean': np.mean(fragment_length), - 'median': np.median(fragment_length), - 'qtile75': np.percentile(fragment_length, 75), - 'max': fragment_length.max(), - 'std': np.std(fragment_length), - 'mad': np.median(np.abs(fragment_length - np.median(fragment_length))), - 'qtile10': np.percentile(fragment_length, 10), - 'qtile20': np.percentile(fragment_length, 20), - 'qtile30': np.percentile(fragment_length, 30), - 'qtile40': np.percentile(fragment_length, 40), - 'qtile60': np.percentile(fragment_length, 60), - 'qtile70': np.percentile(fragment_length, 70), - 'qtile80': np.percentile(fragment_length, 80), - 'qtile90': np.percentile(fragment_length, 90), - 'qtile99': np.percentile(fragment_length, 99)} + fragment_len_dict = { + "sample_size": len(fragment_length), + "min": fragment_length.min(), + "qtile25": np.percentile(fragment_length, 25), + "mean": np.mean(fragment_length), + "median": np.median(fragment_length), + "qtile75": np.percentile(fragment_length, 75), + "max": fragment_length.max(), + "std": np.std(fragment_length), + "mad": np.median(np.abs(fragment_length - np.median(fragment_length))), + "qtile10": np.percentile(fragment_length, 10), + "qtile20": np.percentile(fragment_length, 20), + "qtile30": np.percentile(fragment_length, 30), + "qtile40": np.percentile(fragment_length, 40), + "qtile60": np.percentile(fragment_length, 60), + "qtile70": np.percentile(fragment_length, 70), + "qtile80": np.percentile(fragment_length, 80), + "qtile90": np.percentile(fragment_length, 90), + "qtile99": np.percentile(fragment_length, 99), + } else: fragment_len_dict = None if return_lengths and fragment_len_dict is not None: - fragment_len_dict['lengths'] = fragment_length - - read_len_dict = {'sample_size': len(read_length), - 'min': 
read_length.min(), - 'qtile25': np.percentile(read_length, 25), - 'mean': np.mean(read_length), - 'median': np.median(read_length), - 'qtile75': np.percentile(read_length, 75), - 'max': read_length.max(), - 'std': np.std(read_length), - 'mad': np.median(np.abs(read_length - np.median(read_length))), - 'qtile10': np.percentile(read_length, 10), - 'qtile20': np.percentile(read_length, 20), - 'qtile30': np.percentile(read_length, 30), - 'qtile40': np.percentile(read_length, 40), - 'qtile60': np.percentile(read_length, 60), - 'qtile70': np.percentile(read_length, 70), - 'qtile80': np.percentile(read_length, 80), - 'qtile90': np.percentile(read_length, 90), - 'qtile99': np.percentile(read_length, 99)} + fragment_len_dict["lengths"] = fragment_length + + read_len_dict = { + "sample_size": len(read_length), + "min": read_length.min(), + "qtile25": np.percentile(read_length, 25), + "mean": np.mean(read_length), + "median": np.median(read_length), + "qtile75": np.percentile(read_length, 75), + "max": read_length.max(), + "std": np.std(read_length), + "mad": np.median(np.abs(read_length - np.median(read_length))), + "qtile10": np.percentile(read_length, 10), + "qtile20": np.percentile(read_length, 20), + "qtile30": np.percentile(read_length, 30), + "qtile40": np.percentile(read_length, 40), + "qtile60": np.percentile(read_length, 60), + "qtile70": np.percentile(read_length, 70), + "qtile80": np.percentile(read_length, 80), + "qtile90": np.percentile(read_length, 90), + "qtile99": np.percentile(read_length, 99), + } if return_lengths: - read_len_dict['lengths'] = read_length + read_len_dict["lengths"] = read_length else: fragment_len_dict = None read_len_dict = None diff --git a/deeptools/getRatio.py b/deeptools/getRatio.py index 937cc7c41..12a1157fc 100644 --- a/deeptools/getRatio.py +++ b/deeptools/getRatio.py @@ -1,17 +1,17 @@ import numpy as np -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") def compute_ratio(value1, value2, args): - value1 = value1 + args['pseudocount'][0] - value2 = value2 + args['pseudocount'][1] + value1 = value1 + args["pseudocount"][0] + value2 = value2 + args["pseudocount"][1] ratio = float(value1) / value2 - if args['valueType'] == 'log2': + if args["valueType"] == "log2": ratio = np.log2(ratio) - elif args['valueType'] == 'reciprocal_ratio': + elif args["valueType"] == "reciprocal_ratio": # the reciprocal ratio of a/b # is a/b if a/b > 1 else -1* b/a ratio = ratio if ratio >= 1 else -1.0 / ratio @@ -54,8 +54,8 @@ def getRatio(tileCoverage, args): 1.0 """ - value1 = args['scaleFactors'][0] * tileCoverage[0] - value2 = args['scaleFactors'][1] * tileCoverage[1] + value1 = args["scaleFactors"][0] * tileCoverage[0] + value2 = args["scaleFactors"][1] * tileCoverage[1] # if any of the two values to compare # is nan, return nan @@ -63,20 +63,20 @@ def getRatio(tileCoverage, args): return np.nan # ratio case - if args['valueType'] in ['ratio', 'log2', 'reciprocal_ratio']: + if args["valueType"] in ["ratio", "log2", "reciprocal_ratio"]: bin_value = compute_ratio(value1, value2, args) # non ratio case (diff, sum etc) else: - if args['valueType'] == 'subtract': + if args["valueType"] == "subtract": bin_value = value1 - value2 - elif args['valueType'] == 'add': + elif args["valueType"] == "add": bin_value = value1 + value2 - elif args['valueType'] == 'first': + elif args["valueType"] == "first": bin_value = value1 - elif args['valueType'] == 'second': + elif args["valueType"] == "second": bin_value = value2 - elif args['valueType'] == 'mean': + elif 
args["valueType"] == "mean": bin_value = (value1 + value2) / 2.0 return bin_value diff --git a/deeptools/getScaleFactor.py b/deeptools/getScaleFactor.py index 541b4febd..289263b00 100644 --- a/deeptools/getScaleFactor.py +++ b/deeptools/getScaleFactor.py @@ -41,7 +41,10 @@ def getFractionKept_worker(chrom, start, end, bamFile, args, offset): continue # filter reads based on SAM flag - if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude: + if ( + args.samFlagInclude + and read.flag & args.samFlagInclude != args.samFlagInclude + ): filtered += 1 continue if args.samFlagExclude and read.flag & args.samFlagExclude != 0: @@ -69,8 +72,11 @@ def getFractionKept_worker(chrom, start, end, bamFile, args, offset): e = s - tLen if read.reference_id != read.next_reference_id: e = read.pnext - if lpos is not None and lpos == read.reference_start \ - and (s, e, read.next_reference_id, read.is_reverse) in prev_pos: + if ( + lpos is not None + and lpos == read.reference_start + and (s, e, read.next_reference_id, read.is_reverse) in prev_pos + ): filtered += 1 continue if lpos != read.reference_start: @@ -82,19 +88,22 @@ def getFractionKept_worker(chrom, start, end, bamFile, args, offset): # This is very similar to what's used in the get_fragment_from_read function in the filterRnaStrand class if hasattr(args, "filterRNAstrand"): if read.is_paired: - if args.filterRNAstrand == 'forward': - if not ((read.flag & 128 == 128 and read.flag & 16 == 0) or (read.flag & 64 == 64 and read.flag & 32 == 0)): + if args.filterRNAstrand == "forward": + if not ( + (read.flag & 128 == 128 and read.flag & 16 == 0) + or (read.flag & 64 == 64 and read.flag & 32 == 0) + ): filtered += 1 continue - elif args.filterRNAstrand == 'reverse': + elif args.filterRNAstrand == "reverse": if not (read.flag & 144 == 144 or read.flag & 96 == 96): filtered += 1 continue else: - if args.filterRNAstrand == 'forward' and read.flag & 16 == 0: + if args.filterRNAstrand == "forward" and read.flag & 16 == 0: filtered += 1 continue - elif args.filterRNAstrand == 'reverse' and read.flag & 16 == 16: + elif args.filterRNAstrand == "reverse" and read.flag & 16 == 16: filtered += 1 continue @@ -123,11 +132,13 @@ def fraction_kept(args, stats): size is halved. """ # Do we even need to proceed? 
diff --git a/deeptools/getScaleFactor.py b/deeptools/getScaleFactor.py index 541b4febd..289263b00 100644 --- a/deeptools/getScaleFactor.py +++ b/deeptools/getScaleFactor.py @@ -41,7 +41,10 @@ def getFractionKept_worker(chrom, start, end, bamFile, args, offset): continue # filter reads based on SAM flag - if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude: + if ( + args.samFlagInclude + and read.flag & args.samFlagInclude != args.samFlagInclude + ): filtered += 1 continue if args.samFlagExclude and read.flag & args.samFlagExclude != 0: @@ -69,8 +72,11 @@ def getFractionKept_worker(chrom, start, end, bamFile, args, offset): e = s - tLen if read.reference_id != read.next_reference_id: e = read.pnext - if lpos is not None and lpos == read.reference_start \ - and (s, e, read.next_reference_id, read.is_reverse) in prev_pos: + if ( + lpos is not None + and lpos == read.reference_start + and (s, e, read.next_reference_id, read.is_reverse) in prev_pos + ): filtered += 1 continue if lpos != read.reference_start: @@ -82,19 +88,22 @@ def getFractionKept_worker(chrom, start, end, bamFile, args, offset): # This is very similar to what's used in the get_fragment_from_read function in the filterRnaStrand class if hasattr(args, "filterRNAstrand"): if read.is_paired: - if args.filterRNAstrand == 'forward': - if not ((read.flag & 128 == 128 and read.flag & 16 == 0) or (read.flag & 64 == 64 and read.flag & 32 == 0)): + if args.filterRNAstrand == "forward": + if not ( + (read.flag & 128 == 128 and read.flag & 16 == 0) + or (read.flag & 64 == 64 and read.flag & 32 == 0) + ): filtered += 1 continue - elif args.filterRNAstrand == 'reverse': + elif args.filterRNAstrand == "reverse": if not (read.flag & 144 == 144 or read.flag & 96 == 96): filtered += 1 continue else: - if args.filterRNAstrand == 'forward' and read.flag & 16 == 0: + if args.filterRNAstrand == "forward" and read.flag & 16 == 0: filtered += 1 continue - elif args.filterRNAstrand == 'reverse' and read.flag & 16 == 16: + elif args.filterRNAstrand == "reverse" and read.flag & 16 == 16: filtered += 1 continue @@ -123,11 +132,13 @@ def fraction_kept(args, stats): size is halved. """ # Do we even need to proceed? - if (not args.minMappingQuality or args.minMappingQuality == 0) and \ - (not args.samFlagInclude or args.samFlagInclude == 0) and \ - (not args.samFlagExclude or args.samFlagExclude == 0) and \ - (not args.minFragmentLength or args.minFragmentLength == 0) and \ - (not args.maxFragmentLength or args.maxFragmentLength == 0): + if ( + (not args.minMappingQuality or args.minMappingQuality == 0) + and (not args.samFlagInclude or args.samFlagInclude == 0) + and (not args.samFlagExclude or args.samFlagExclude == 0) + and (not args.minFragmentLength or args.minFragmentLength == 0) + and (not args.maxFragmentLength or args.maxFragmentLength == 0) + ): if hasattr(args, "filterRNAstrand"): if args.filterRNAstrand not in ["forward", "reverse"]: return 1.0 @@ -138,7 +149,9 @@ def fraction_kept(args, stats): total = 0 distanceBetweenBins = 2000000 bam_handle = bamHandler.openBam(args.bam) - bam_mapped = utilities.bam_total_reads(bam_handle, args.ignoreForNormalization, stats) + bam_mapped = utilities.bam_total_reads( + bam_handle, args.ignoreForNormalization, stats + ) if bam_mapped < 1000000: num_needed_to_sample = bam_mapped else: @@ -151,21 +164,28 @@ def fraction_kept(args, stats): if num_needed_to_sample == bam_mapped: distanceBetweenBins = 55000 if args.ignoreForNormalization: - chrom_sizes = [(chrom_name, bam_handle.lengths[idx]) for idx, chrom_name in enumerate(bam_handle.references) - if chrom_name not in args.ignoreForNormalization] + chrom_sizes = [ + (chrom_name, bam_handle.lengths[idx]) + for idx, chrom_name in enumerate(bam_handle.references) + if chrom_name not in args.ignoreForNormalization + ] else: chrom_sizes = list(zip(bam_handle.references, bam_handle.lengths)) offset = 0 # Iterate over bins at various non-overlapping offsets until we have enough data - while total < num_needed_to_sample and offset < np.ceil(distanceBetweenBins / 50000): - res = mapReduce.mapReduce((bam_handle.filename, args, offset), - getFractionKept_wrapper, - chrom_sizes, - genomeChunkLength=distanceBetweenBins, - blackListFileName=args.blackListFileName, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose) + while total < num_needed_to_sample and offset < np.ceil( + distanceBetweenBins / 50000 + ): + res = mapReduce.mapReduce( + (bam_handle.filename, args, offset), + getFractionKept_wrapper, + chrom_sizes, + genomeChunkLength=distanceBetweenBins, + blackListFileName=args.blackListFileName, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + ) if len(res): foo, bar = np.sum(res, axis=0)
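In outline, fraction_kept above samples reads from widely spaced bins, tallies how many of them the active filters would discard, and reports the kept fraction that later multiplies the scale factor. A schematic reduction with made-up worker results (the real workers return more detail than this):

# pretend each worker returned (n_filtered, n_seen) for its genomic chunk
res = [(120, 1000), (80, 1000), (50, 500)]
filtered, total = [sum(col) for col in zip(*res)]
fraction_kept = 1.0 - float(filtered) / total  # 250 of 2500 discarded -> 0.9 kept
assert abs(fraction_kept - 0.9) < 1e-9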
args.blackListFileName, + args.numberOfProcessors, + ) + print( + "There are {0} alignments, of which {1} are completely " + "within a blacklist region.".format(bam_mapped_total, blacklisted) + ) num_kept_reads = bam_mapped_total - blacklisted else: num_kept_reads = bam_mapped_total ftk = fraction_kept(args, stats) if ftk < 1: num_kept_reads *= ftk - print("Due to filtering, {0}% of the aforementioned alignments " - "will be used {1}".format(100 * ftk, num_kept_reads)) + print( + "Due to filtering, {0}% of the aforementioned alignments " + "will be used {1}".format(100 * ftk, num_kept_reads) + ) return num_kept_reads, bam_mapped_total @@ -213,50 +245,74 @@ def get_num_kept_reads(args, stats): def get_scale_factor(args, stats): scale_factor = args.scaleFactor bam_mapped, bam_mapped_total = get_num_kept_reads(args, stats) - if args.normalizeUsing == 'RPGC': + if args.normalizeUsing == "RPGC": # Print output, since normalzation stuff isn't printed to stderr otherwise - sys.stderr.write("normalization: 1x (effective genome size {})\n".format(args.effectiveGenomeSize)) + sys.stderr.write( + "normalization: 1x (effective genome size {})\n".format( + args.effectiveGenomeSize + ) + ) # try to guess fragment length if the bam file contains paired end reads from deeptools.getFragmentAndReadSize import get_read_and_fragment_length - frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bam, - return_lengths=False, - blackListFileName=args.blackListFileName, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose) + + frag_len_dict, read_len_dict = get_read_and_fragment_length( + args.bam, + return_lengths=False, + blackListFileName=args.blackListFileName, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + ) if args.extendReads: if args.extendReads is True: # try to guess fragment length if the bam file contains paired end reads if frag_len_dict: - fragment_length = frag_len_dict['median'] + fragment_length = frag_len_dict["median"] else: - exit("*ERROR*: library is not paired-end. Please provide an extension length.") + exit( + "*ERROR*: library is not paired-end. Please provide an extension length." + ) if args.verbose: - print(("Fragment length based on paired en data " - "estimated to be {}".format(frag_len_dict['median']))) + print( + ( + "Fragment length based on paired en data " + "estimated to be {}".format(frag_len_dict["median"]) + ) + ) elif args.extendReads < 1: - exit("*ERROR*: read extension must be bigger than one. Value give: {} ".format(args.extendReads)) + exit( + "*ERROR*: read extension must be bigger than one. Value give: {} ".format( + args.extendReads + ) + ) elif args.extendReads > 2000: - exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads)) + exit( + "*ERROR*: read extension must be smaller that 2000. 
Value give: {} ".format( + args.extendReads + ) + ) else: fragment_length = args.extendReads else: # set as fragment length the read length - fragment_length = int(read_len_dict['median']) + fragment_length = int(read_len_dict["median"]) if args.verbose: - print("Estimated read length is {}".format(int(read_len_dict['median']))) + print( + "Estimated read length is {}".format(int(read_len_dict["median"])) + ) - current_coverage = \ + current_coverage = ( float(bam_mapped * fragment_length) / args.effectiveGenomeSize + ) # the scaling sets the coverage to match 1x scale_factor *= 1.0 / current_coverage if debug: print("Estimated current coverage {}".format(current_coverage)) print("Scaling factor {}".format(args.scaleFactor)) - elif args.normalizeUsing == 'RPKM': + elif args.normalizeUsing == "RPKM": # Print output, since normalzation stuff isn't printed to stderr otherwise sys.stderr.write("normalization: RPKM\n") @@ -270,7 +326,7 @@ def get_scale_factor(args, stats): if debug: print("scale factor using RPKM is {0}".format(args.scaleFactor)) - elif args.normalizeUsing == 'CPM': + elif args.normalizeUsing == "CPM": # Print output, since normalzation stuff isn't printed to stderr otherwise sys.stderr.write("normalization: CPM\n") @@ -281,7 +337,7 @@ def get_scale_factor(args, stats): if debug: print("scale factor using CPM is {0}".format(args.scaleFactor)) - elif args.normalizeUsing == 'BPM': + elif args.normalizeUsing == "BPM": # Print output, since normalzation stuff isn't printed to stderr otherwise sys.stderr.write("normalization: BPM\n") # the BPM (norm is based on post-filtering total counts of reads in BAM "bam_mapped") @@ -295,7 +351,9 @@ def get_scale_factor(args, stats): else: # Print output, since normalzation stuff isn't printed to stderr otherwise - sys.stderr.write("normalization: none (signal scaled by the fraction of alignments kept after filtering)\n") + sys.stderr.write( + "normalization: none (signal scaled by the fraction of alignments kept after filtering)\n" + ) scale_factor *= bam_mapped / float(bam_mapped_total) diff --git a/deeptools/getScorePerBigWigBin.py b/deeptools/getScorePerBigWigBin.py index 6f34f288c..411f4ec9a 100644 --- a/deeptools/getScorePerBigWigBin.py +++ b/deeptools/getScorePerBigWigBin.py @@ -8,9 +8,10 @@ # deepTools packages import deeptools.mapReduce as mapReduce import deeptools.utilities + # debug = 0 -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") def countReadsInRegions_wrapper(args): @@ -18,13 +19,10 @@ def countReadsInRegions_wrapper(args): return countFragmentsInRegions_worker(*args) -def countFragmentsInRegions_worker(chrom, start, end, - bigWigFiles, - stepSize, binLength, - save_data, - bedRegions=None - ): - """ returns the average score in each bigwig file at each 'stepSize' +def countFragmentsInRegions_worker( + chrom, start, end, bigWigFiles, stepSize, binLength, save_data, bedRegions=None +): + """returns the average score in each bigwig file at each 'stepSize' position within the interval start, end for a 'binLength' window. 
Because the idea is to get counts for window positions at different positions for sampling, the bins are equally spaced @@ -78,10 +76,10 @@ def countFragmentsInRegions_worker(chrom, start, end, regions_to_consider.append([(i, i + binLength)]) if save_data: - _file = open(deeptools.utilities.getTempFileName(suffix='.bed'), 'w+t') + _file = open(deeptools.utilities.getTempFileName(suffix=".bed"), "w+t") _file_name = _file.name else: - _file_name = '' + _file_name = "" warnings.simplefilter("default") i = 0 for reg in regions_to_consider: @@ -91,14 +89,18 @@ for idx, bwh in enumerate(bigwig_handles): if chrom not in bwh.chroms(): unmod_name = chrom - if chrom.startswith('chr'): + if chrom.startswith("chr"): # remove the chr part from chromosome name chrom = chrom[3:] else: # prefix with 'chr' the chromosome name - chrom = 'chr' + chrom + chrom = "chr" + chrom if chrom not in bwh.chroms(): - exit('Chromosome name {} not found in bigwig file\n {}\n'.format(unmod_name, bigWigFiles[idx])) + exit( + "Chromosome name {} not found in bigwig file\n {}\n".format( + unmod_name, bigWigFiles[idx] + ) + ) weights = [] scores = [] @@ -109,7 +111,9 @@ if score is None or score == [None] or np.isnan(score[0]): score = [np.nan] scores.extend(score) - avgReadsArray.append(np.average(scores, weights=weights)) # mean of fragment coverage for region + avgReadsArray.append( + np.average(scores, weights=weights) + ) # mean of fragment coverage for region sub_score_per_bin.extend(avgReadsArray) rows += 1 @@ -146,6 +150,7 @@ def getChromSizes(bigwigFilesList): Chromosome name(s) and size(s). >>> assert(getChromSizes([test.bwFile1, test.bwFile2]) == ([('3R', 200)], set([]))) """ + def print_chr_names_and_size(chr_set): sys.stderr.write("chromosome\tlength\n") for name, size in chr_set: @@ -166,18 +171,22 @@ def print_chr_names_and_size(chr_set): # try to add or remove 'chr' from the chromosome name _corr_names_size = set() for chrom_name, size in _names_and_size: - if chrom_name.startswith('chr'): + if chrom_name.startswith("chr"): _corr_names_size.add((chrom_name[3:], size)) else: - _corr_names_size.add(('chr' + chrom_name, size)) + _corr_names_size.add(("chr" + chrom_name, size)) if len(common_chr & _corr_names_size) == 0: - message = "No common chromosomes found. Are the bigwig files " \ - "from the same species and same assemblies?\n" + message = ( + "No common chromosomes found.
Are the bigwig files " + "from the same species and same assemblies?\n" + ) sys.stderr.write(message) print_chr_names_and_size(common_chr) - sys.stderr.write("\nand the following is the list of the unmatched chromosome and chromosome\n" - "lengths from file\n{}\n".format(bw)) + sys.stderr.write( + "\nand the following is the list of the unmatched chromosome names and\n" + "chromosome lengths from file\n{}\n".format(bw) + ) print_chr_names_and_size(_names_and_size) exit(1) else: @@ -187,22 +196,28 @@ def print_chr_names_and_size(chr_set): common_chr = common_chr & _names_and_size if len(non_common_chr) > 0: - sys.stderr.write("\nThe following chromosome names did not match between the bigwig files\n") + sys.stderr.write( + "\nThe following chromosome names did not match between the bigwig files\n" + ) print_chr_names_and_size(non_common_chr) # get the list of common chromosome names and sizes return sorted(common_chr), non_common_chr -def getScorePerBin(bigWigFiles, binLength, - numberOfProcessors=1, - verbose=False, region=None, - bedFile=None, - blackListFileName=None, - stepSize=None, - chrsToSkip=[], - out_file_for_raw_data=None, - allArgs=None): +def getScorePerBin( + bigWigFiles, + binLength, + numberOfProcessors=1, + verbose=False, + region=None, + bedFile=None, + blackListFileName=None, + stepSize=None, + chrsToSkip=[], + out_file_for_raw_data=None, + allArgs=None, +): """ This function returns a matrix containing scores (median) for the coverage of fragments within a region. Each row corresponds to a sampled region. @@ -252,32 +267,41 @@ def getScorePerBin(bigWigFiles, binLength, save_file = False # Handle GTF options - transcriptID, exonID, transcript_id_designator, keepExons = deeptools.utilities.gtfOptions(allArgs) - - imap_res = mapReduce.mapReduce((bigWigFiles, stepSize, binLength, save_file), - countReadsInRegions_wrapper, - chrom_sizes, - genomeChunkLength=chunkSize, - bedFile=bedFile, - blackListFileName=blackListFileName, - region=region, - numberOfProcessors=numberOfProcessors, - transcriptID=transcriptID, - exonID=exonID, - keepExons=keepExons, - transcript_id_designator=transcript_id_designator) + ( + transcriptID, + exonID, + transcript_id_designator, + keepExons, + ) = deeptools.utilities.gtfOptions(allArgs) + + imap_res = mapReduce.mapReduce( + (bigWigFiles, stepSize, binLength, save_file), + countReadsInRegions_wrapper, + chrom_sizes, + genomeChunkLength=chunkSize, + bedFile=bedFile, + blackListFileName=blackListFileName, + region=region, + numberOfProcessors=numberOfProcessors, + transcriptID=transcriptID, + exonID=exonID, + keepExons=keepExons, + transcript_id_designator=transcript_id_designator, + ) if out_file_for_raw_data: if len(non_common): - sys.stderr.write("*Warning*\nThe resulting bed file does not contain information for " - "the chromosomes that were not common between the bigwig files\n") + sys.stderr.write( + "*Warning*\nThe resulting bed file does not contain information for " + "the chromosomes that were not common between the bigwig files\n" + ) # concatenate intermediary bedgraph files ofile = open(out_file_for_raw_data, "w") for _values, tempFileName in imap_res: if tempFileName: # concatenate all intermediate tempfiles into one - f = open(tempFileName, 'r') + f = open(tempFileName, "r") shutil.copyfileobj(f, ofile) f.close() os.remove(tempFileName) @@ -290,7 +314,6 @@ def getScorePerBin(bigWigFiles, binLength, class Tester(object): - def __init__(self): """ The two bigWig files are as follows: @@ -317,6 +340,6 @@ def __init__(self): self.bwFile1 =
self.root + "testA.bw" self.bwFile2 = self.root + "testB.bw" self.bwFile_PE = self.root + "test_paired2.bw" - self.chrom = '3R' + self.chrom = "3R" # global debug # debug = 0 diff --git a/deeptools/heatmapper.py b/deeptools/heatmapper.py index d0eebaad2..84557dc42 100644 --- a/deeptools/heatmapper.py +++ b/deeptools/heatmapper.py @@ -11,7 +11,7 @@ from deeptools.heatmapper_utilities import getProfileTicks -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") def chopRegions(exonsInput, left=0, right=0): @@ -133,7 +133,7 @@ def chopRegionsFromMiddle(exonsInput, left=0, right=0): rSum += size if rSum == right: break - rightBins = rightBins[:i + 1] + rightBins = rightBins[: i + 1] elif rSum < right: padRight = right - rSum @@ -188,51 +188,90 @@ def __init__(self): self.blackList = None self.quiet = True # These are parameters that were single values in versions <3 but are now internally lists. See issue #614 - self.special_params = set(['unscaled 5 prime', 'unscaled 3 prime', 'body', 'downstream', 'upstream', 'ref point', 'bin size']) + self.special_params = set( + [ + "unscaled 5 prime", + "unscaled 3 prime", + "body", + "downstream", + "upstream", + "ref point", + "bin size", + ] + ) def getTicks(self, idx): """ This is essentially a wrapper around getProfileTicks to accomdate the fact that each column has its own ticks. """ - xticks, xtickslabel = getProfileTicks(self, self.reference_point_label[idx], self.startLabel, self.endLabel, idx) + xticks, xtickslabel = getProfileTicks( + self, self.reference_point_label[idx], self.startLabel, self.endLabel, idx + ) return xticks, xtickslabel - def computeMatrix(self, score_file_list, regions_file, parameters, blackListFileName=None, verbose=False, allArgs=None): + def computeMatrix( + self, + score_file_list, + regions_file, + parameters, + blackListFileName=None, + verbose=False, + allArgs=None, + ): """ Splits into multiple cores the computation of the scores per bin for each region (defined by a hash '#' in the regions (BED/GFF) file. 
""" - if parameters['body'] > 0 and \ - parameters['body'] % parameters['bin size'] > 0: - exit("The --regionBodyLength has to be " - "a multiple of --binSize.\nCurrently the " - "values are {} {} for\nregionsBodyLength and " - "binSize respectively\n".format(parameters['body'], - parameters['bin size'])) + if parameters["body"] > 0 and parameters["body"] % parameters["bin size"] > 0: + exit( + "The --regionBodyLength has to be " + "a multiple of --binSize.\nCurrently the " + "values are {} {} for\nregionsBodyLength and " + "binSize respectively\n".format( + parameters["body"], parameters["bin size"] + ) + ) # the beforeRegionStartLength is extended such that # length is a multiple of binSize - if parameters['downstream'] % parameters['bin size'] > 0: - exit("Length of region after the body has to be " - "a multiple of --binSize.\nCurrent value " - "is {}\n".format(parameters['downstream'])) - - if parameters['upstream'] % parameters['bin size'] > 0: - exit("Length of region before the body has to be a multiple of " - "--binSize\nCurrent value is {}\n".format(parameters['upstream'])) - - if parameters['unscaled 5 prime'] % parameters['bin size'] > 0: - exit("Length of the unscaled 5 prime region has to be a multiple of " - "--binSize\nCurrent value is {}\n".format(parameters['unscaled 5 prime'])) - - if parameters['unscaled 3 prime'] % parameters['bin size'] > 0: - exit("Length of the unscaled 5 prime region has to be a multiple of " - "--binSize\nCurrent value is {}\n".format(parameters['unscaled 3 prime'])) - - if parameters['unscaled 5 prime'] + parameters['unscaled 3 prime'] > 0 and parameters['body'] == 0: - exit('Unscaled 5- and 3-prime regions only make sense with the scale-regions subcommand.\n') + if parameters["downstream"] % parameters["bin size"] > 0: + exit( + "Length of region after the body has to be " + "a multiple of --binSize.\nCurrent value " + "is {}\n".format(parameters["downstream"]) + ) + + if parameters["upstream"] % parameters["bin size"] > 0: + exit( + "Length of region before the body has to be a multiple of " + "--binSize\nCurrent value is {}\n".format(parameters["upstream"]) + ) + + if parameters["unscaled 5 prime"] % parameters["bin size"] > 0: + exit( + "Length of the unscaled 5 prime region has to be a multiple of " + "--binSize\nCurrent value is {}\n".format( + parameters["unscaled 5 prime"] + ) + ) + + if parameters["unscaled 3 prime"] % parameters["bin size"] > 0: + exit( + "Length of the unscaled 5 prime region has to be a multiple of " + "--binSize\nCurrent value is {}\n".format( + parameters["unscaled 3 prime"] + ) + ) + + if ( + parameters["unscaled 5 prime"] + parameters["unscaled 3 prime"] > 0 + and parameters["body"] == 0 + ): + exit( + "Unscaled 5- and 3-prime regions only make sense with the scale-regions subcommand.\n" + ) # Take care of GTF options transcriptID = "transcript" @@ -244,24 +283,28 @@ def computeMatrix(self, score_file_list, regions_file, parameters, blackListFile allArgs = vars(allArgs) transcriptID = allArgs.get("transcriptID", transcriptID) exonID = allArgs.get("exonID", exonID) - transcript_id_designator = allArgs.get("transcript_id_designator", transcript_id_designator) + transcript_id_designator = allArgs.get( + "transcript_id_designator", transcript_id_designator + ) keepExons = allArgs.get("keepExons", keepExons) self.quiet = allArgs.get("quiet", self.quiet) chromSizes, _ = getScorePerBigWigBin.getChromSizes(score_file_list) - res, labels = mapReduce.mapReduce([score_file_list, parameters], - compute_sub_matrix_wrapper, - 
chromSizes, - self_=self, - bedFile=regions_file, - blackListFileName=blackListFileName, - numberOfProcessors=parameters['proc number'], - includeLabels=True, - transcriptID=transcriptID, - exonID=exonID, - transcript_id_designator=transcript_id_designator, - keepExons=keepExons, - verbose=verbose) + res, labels = mapReduce.mapReduce( + [score_file_list, parameters], + compute_sub_matrix_wrapper, + chromSizes, + self_=self, + bedFile=regions_file, + blackListFileName=blackListFileName, + numberOfProcessors=parameters["proc number"], + includeLabels=True, + transcriptID=transcriptID, + exonID=exonID, + transcript_id_designator=transcript_id_designator, + keepExons=keepExons, + verbose=verbose, + ) # each worker in the pool returns a tuple containing # the submatrix data, the regions that correspond to the # submatrix, and the number of regions lacking scores @@ -284,37 +327,43 @@ def computeMatrix(self, score_file_list, regions_file, parameters, blackListFile # mask invalid (nan) values matrix = np.ma.masked_invalid(matrix) - assert matrix.shape[0] == len(regions), \ - "matrix length does not match regions length" + assert matrix.shape[0] == len( + regions + ), "matrix length does not match regions length" if len(regions) == 0: - sys.stderr.write("\nERROR: Either the BED file does not contain any valid regions or there are none remaining after filtering.\n") + sys.stderr.write( + "\nERROR: Either the BED file does not contain any valid regions or there are none remaining after filtering.\n" + ) exit(1) if regions_no_score == len(regions): - exit("\nERROR: None of the BED regions could be found in the bigWig" - "file.\nPlease check that the bigwig file is valid and " - "that the chromosome names between the BED file and " - "the bigWig file correspond to each other\n") + exit( + "\nERROR: None of the BED regions could be found in the bigWig " + "file.\nPlease check that the bigwig file is valid and " + "that the chromosome names between the BED file and " + "the bigWig file correspond to each other\n" + ) if regions_no_score > len(regions) * 0.75: - file_type = 'bigwig' if score_file_list[0].endswith(".bw") else "BAM" + file_type = "bigwig" if score_file_list[0].endswith(".bw") else "BAM" prcnt = 100 * float(regions_no_score) / len(regions) sys.stderr.write( "\n\nWarning: {0:.2f}% of regions are *not* associated\n" "to any score in the given {1} file. Check that the\n" "chromosome names from the BED file are consistent with\n" "the chromosome names in the given {2} file and that both\n" - "files refer to the same species\n\n".format(prcnt, - file_type, - file_type)) + "files refer to the same species\n\n".format( + prcnt, file_type, file_type + ) + ) self.parameters = parameters numcols = matrix.shape[1] num_ind_cols = self.get_num_individual_matrix_cols() sample_boundaries = list(range(0, numcols + num_ind_cols, num_ind_cols)) - if allArgs is not None and allArgs['samplesLabel'] is not None: - sample_labels = allArgs['samplesLabel'] + if allArgs is not None and allArgs["samplesLabel"] is not None: + sample_labels = allArgs["samplesLabel"] else: sample_labels = smartLabels(score_file_list) @@ -339,26 +388,32 @@ def computeMatrix(self, score_file_list, regions_file, parameters, blackListFile sys.stderr.write( "One of the groups defined in the bed file is " "too small.\nGroups that are too small can't be plotted.
" - "\n") - - self.matrix = _matrix(regions, matrix, - group_boundaries, - sample_boundaries, - group_labels_filtered, - sample_labels) - - if parameters['skip zeros']: + "\n" + ) + + self.matrix = _matrix( + regions, + matrix, + group_boundaries, + sample_boundaries, + group_labels_filtered, + sample_labels, + ) + + if parameters["skip zeros"]: self.matrix.removeempty() @staticmethod - def compute_sub_matrix_worker(self, chrom, start, end, score_file_list, parameters, regions): + def compute_sub_matrix_worker( + self, chrom, start, end, score_file_list, parameters, regions + ): """ Returns ------- numpy matrix A numpy matrix that contains per each row the values found per each of the regions given """ - if parameters['verbose']: + if parameters["verbose"]: sys.stderr.write("Processing {}:{}-{}\n".format(chrom, start, end)) # read BAM or scores file @@ -368,11 +423,16 @@ def compute_sub_matrix_worker(self, chrom, start, end, score_file_list, paramete # determine the number of matrix columns based on the lengths # given by the user, times the number of score files - matrix_cols = len(score_file_list) * \ - ((parameters['downstream'] + - parameters['unscaled 5 prime'] + parameters['unscaled 3 prime'] + - parameters['upstream'] + parameters['body']) // - parameters['bin size']) + matrix_cols = len(score_file_list) * ( + ( + parameters["downstream"] + + parameters["unscaled 5 prime"] + + parameters["unscaled 3 prime"] + + parameters["upstream"] + + parameters["body"] + ) + // parameters["bin size"] + ) # create an empty matrix to store the values sub_matrix = np.zeros((len(regions), matrix_cols)) @@ -396,42 +456,83 @@ def compute_sub_matrix_worker(self, chrom, start, end, score_file_list, paramete downstream = [] # get the body length - body_length = np.sum([x[1] - x[0] for x in exons]) - parameters['unscaled 5 prime'] - parameters['unscaled 3 prime'] + body_length = ( + np.sum([x[1] - x[0] for x in exons]) + - parameters["unscaled 5 prime"] + - parameters["unscaled 3 prime"] + ) # print some information - if parameters['body'] > 0 and \ - body_length < parameters['bin size']: + if parameters["body"] > 0 and body_length < parameters["bin size"]: if not self.quiet: - sys.stderr.write("A region that is shorter than the bin size (possibly only after accounting for unscaled regions) was found: " - "({0}) {1} {2}:{3}:{4}. Skipping...\n".format((body_length - parameters['unscaled 5 prime'] - parameters['unscaled 3 prime']), - feature_name, feature_chrom, - feature_start, feature_end)) + sys.stderr.write( + "A region that is shorter than the bin size (possibly only after accounting for unscaled regions) was found: " + "({0}) {1} {2}:{3}:{4}. 
Skipping...\n".format( + ( + body_length + - parameters["unscaled 5 prime"] + - parameters["unscaled 3 prime"] + ), + feature_name, + feature_chrom, + feature_start, + feature_end, + ) + ) coverage = np.zeros(matrix_cols) - if not parameters['missing data as zero']: + if not parameters["missing data as zero"]: coverage[:] = np.nan else: - if feature_strand == '-': - if parameters['downstream'] > 0: - upstream = [(feature_start - parameters['downstream'], feature_start)] - if parameters['upstream'] > 0: - downstream = [(feature_end, feature_end + parameters['upstream'])] - unscaled5prime, body, unscaled3prime, padLeft, padRight = chopRegions(exons, left=parameters['unscaled 3 prime'], right=parameters['unscaled 5 prime']) + if feature_strand == "-": + if parameters["downstream"] > 0: + upstream = [ + (feature_start - parameters["downstream"], feature_start) + ] + if parameters["upstream"] > 0: + downstream = [ + (feature_end, feature_end + parameters["upstream"]) + ] + ( + unscaled5prime, + body, + unscaled3prime, + padLeft, + padRight, + ) = chopRegions( + exons, + left=parameters["unscaled 3 prime"], + right=parameters["unscaled 5 prime"], + ) # bins per zone - a = parameters['downstream'] // parameters['bin size'] - b = parameters['unscaled 3 prime'] // parameters['bin size'] - d = parameters['unscaled 5 prime'] // parameters['bin size'] - e = parameters['upstream'] // parameters['bin size'] + a = parameters["downstream"] // parameters["bin size"] + b = parameters["unscaled 3 prime"] // parameters["bin size"] + d = parameters["unscaled 5 prime"] // parameters["bin size"] + e = parameters["upstream"] // parameters["bin size"] else: - if parameters['upstream'] > 0: - upstream = [(feature_start - parameters['upstream'], feature_start)] - if parameters['downstream'] > 0: - downstream = [(feature_end, feature_end + parameters['downstream'])] - unscaled5prime, body, unscaled3prime, padLeft, padRight = chopRegions(exons, left=parameters['unscaled 5 prime'], right=parameters['unscaled 3 prime']) - a = parameters['upstream'] // parameters['bin size'] - b = parameters['unscaled 5 prime'] // parameters['bin size'] - d = parameters['unscaled 3 prime'] // parameters['bin size'] - e = parameters['downstream'] // parameters['bin size'] - c = parameters['body'] // parameters['bin size'] + if parameters["upstream"] > 0: + upstream = [ + (feature_start - parameters["upstream"], feature_start) + ] + if parameters["downstream"] > 0: + downstream = [ + (feature_end, feature_end + parameters["downstream"]) + ] + ( + unscaled5prime, + body, + unscaled3prime, + padLeft, + padRight, + ) = chopRegions( + exons, + left=parameters["unscaled 5 prime"], + right=parameters["unscaled 3 prime"], + ) + a = parameters["upstream"] // parameters["bin size"] + b = parameters["unscaled 5 prime"] // parameters["bin size"] + d = parameters["unscaled 3 prime"] // parameters["bin size"] + e = parameters["downstream"] // parameters["bin size"] + c = parameters["body"] // parameters["bin size"] # build zones (each is a list of tuples) # zone0: region before the region start, @@ -443,46 +544,84 @@ def compute_sub_matrix_worker(self, chrom, start, end, score_file_list, paramete # Note that for "reference-point", upstream/downstream will go # through the exons (if requested) and then possibly continue # on the other side (unless parameters['nan after end'] is true) - if parameters['body'] > 0: - zones = [(upstream, a), (unscaled5prime, b), (body, c), (unscaled3prime, d), (downstream, e)] - elif parameters['ref point'] == 'TES': # 
around TES - if feature_strand == '-': - downstream, body, unscaled3prime, padRight, _ = chopRegions(exons, left=parameters['upstream']) - if padRight > 0 and parameters['nan after end'] is True: + if parameters["body"] > 0: + zones = [ + (upstream, a), + (unscaled5prime, b), + (body, c), + (unscaled3prime, d), + (downstream, e), + ] + elif parameters["ref point"] == "TES": # around TES + if feature_strand == "-": + downstream, body, unscaled3prime, padRight, _ = chopRegions( + exons, left=parameters["upstream"] + ) + if padRight > 0 and parameters["nan after end"] is True: padRightNaN += padRight elif padRight > 0: - downstream.append((downstream[-1][1], downstream[-1][1] + padRight)) + downstream.append( + (downstream[-1][1], downstream[-1][1] + padRight) + ) padRight = 0 else: - unscale5prime, body, upstream, _, padLeft = chopRegions(exons, right=parameters['upstream']) - if padLeft > 0 and parameters['nan after end'] is True: + unscale5prime, body, upstream, _, padLeft = chopRegions( + exons, right=parameters["upstream"] + ) + if padLeft > 0 and parameters["nan after end"] is True: padLeftNaN += padLeft elif padLeft > 0: - upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0])) + upstream.insert( + 0, (upstream[0][0] - padLeft, upstream[0][0]) + ) padLeft = 0 - e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size'] - a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size'] + e = ( + np.sum([x[1] - x[0] for x in downstream]) + // parameters["bin size"] + ) + a = ( + np.sum([x[1] - x[0] for x in upstream]) + // parameters["bin size"] + ) zones = [(upstream, a), (downstream, e)] - elif parameters['ref point'] == 'center': # at the region center - if feature_strand == '-': - upstream, downstream, padLeft, padRight = chopRegionsFromMiddle(exons, left=parameters['downstream'], right=parameters['upstream']) + elif parameters["ref point"] == "center": # at the region center + if feature_strand == "-": + upstream, downstream, padLeft, padRight = chopRegionsFromMiddle( + exons, + left=parameters["downstream"], + right=parameters["upstream"], + ) else: - upstream, downstream, padLeft, padRight = chopRegionsFromMiddle(exons, left=parameters['upstream'], right=parameters['downstream']) - if padLeft > 0 and parameters['nan after end'] is True: + upstream, downstream, padLeft, padRight = chopRegionsFromMiddle( + exons, + left=parameters["upstream"], + right=parameters["downstream"], + ) + if padLeft > 0 and parameters["nan after end"] is True: padLeftNaN += padLeft elif padLeft > 0: if len(upstream) > 0: - upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0])) + upstream.insert( + 0, (upstream[0][0] - padLeft, upstream[0][0]) + ) else: upstream = [(downstream[0][0] - padLeft, downstream[0][0])] padLeft = 0 - if padRight > 0 and parameters['nan after end'] is True: + if padRight > 0 and parameters["nan after end"] is True: padRightNaN += padRight elif padRight > 0: - downstream.append((downstream[-1][1], downstream[-1][1] + padRight)) + downstream.append( + (downstream[-1][1], downstream[-1][1] + padRight) + ) padRight = 0 - a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size'] - e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size'] + a = ( + np.sum([x[1] - x[0] for x in upstream]) + // parameters["bin size"] + ) + e = ( + np.sum([x[1] - x[0] for x in downstream]) + // parameters["bin size"] + ) # It's possible for a/e to be floats or 0 yet upstream/downstream isn't empty if a < 1: upstream = [] @@ -492,36 +631,52 @@ def 
compute_sub_matrix_worker(self, chrom, start, end, score_file_list, paramete e = 0 zones = [(upstream, a), (downstream, e)] else: # around TSS - if feature_strand == '-': - unscale5prime, body, upstream, _, padLeft = chopRegions(exons, right=parameters['downstream']) - if padLeft > 0 and parameters['nan after end'] is True: + if feature_strand == "-": + unscale5prime, body, upstream, _, padLeft = chopRegions( + exons, right=parameters["downstream"] + ) + if padLeft > 0 and parameters["nan after end"] is True: padLeftNaN += padLeft elif padLeft > 0: - upstream.insert(0, (upstream[0][0] - padLeft, upstream[0][0])) + upstream.insert( + 0, (upstream[0][0] - padLeft, upstream[0][0]) + ) padLeft = 0 else: - downstream, body, unscaled3prime, padRight, _ = chopRegions(exons, left=parameters['downstream']) - if padRight > 0 and parameters['nan after end'] is True: + downstream, body, unscaled3prime, padRight, _ = chopRegions( + exons, left=parameters["downstream"] + ) + if padRight > 0 and parameters["nan after end"] is True: padRightNaN += padRight elif padRight > 0: - downstream.append((downstream[-1][1], downstream[-1][1] + padRight)) + downstream.append( + (downstream[-1][1], downstream[-1][1] + padRight) + ) padRight = 0 - a = np.sum([x[1] - x[0] for x in upstream]) // parameters['bin size'] - e = np.sum([x[1] - x[0] for x in downstream]) // parameters['bin size'] + a = ( + np.sum([x[1] - x[0] for x in upstream]) + // parameters["bin size"] + ) + e = ( + np.sum([x[1] - x[0] for x in downstream]) + // parameters["bin size"] + ) zones = [(upstream, a), (downstream, e)] - foo = parameters['upstream'] - bar = parameters['downstream'] - if feature_strand == '-': + foo = parameters["upstream"] + bar = parameters["downstream"] + if feature_strand == "-": foo, bar = bar, foo if padLeftNaN > 0: - expected = foo // parameters['bin size'] - padLeftNaN = int(round(float(padLeftNaN) / parameters['bin size'])) + expected = foo // parameters["bin size"] + padLeftNaN = int(round(float(padLeftNaN) / parameters["bin size"])) if expected - padLeftNaN - a > 0: padLeftNaN += 1 if padRightNaN > 0: - expected = bar // parameters['bin size'] - padRightNaN = int(round(float(padRightNaN) / parameters['bin size'])) + expected = bar // parameters["bin size"] + padRightNaN = int( + round(float(padRightNaN) / parameters["bin size"]) + ) if expected - padRightNaN - e > 0: padRightNaN += 1 @@ -531,11 +686,14 @@ def compute_sub_matrix_worker(self, chrom, start, end, score_file_list, paramete for sc_handler in score_file_handles: # We're only supporting bigWig files at this point cov = heatmapper.coverage_from_big_wig( - sc_handler, feature_chrom, zones, - parameters['bin size'], - parameters['bin avg type'], - parameters['missing data as zero'], - not self.quiet) + sc_handler, + feature_chrom, + zones, + parameters["bin size"], + parameters["bin avg type"], + parameters["missing data as zero"], + not self.quiet, + ) if padLeftNaN > 0: cov = np.concatenate([[np.nan] * padLeftNaN, cov]) @@ -553,11 +711,12 @@ def compute_sub_matrix_worker(self, chrom, start, end, score_file_list, paramete sys.stderr.write( "No data was found for region " "{0} {1}:{2}-{3}. 
Skipping...\n".format( - feature_name, feature_chrom, - feature_start, feature_end)) + feature_name, feature_chrom, feature_start, feature_end + ) + ) coverage = np.zeros(matrix_cols) - if not parameters['missing data as zero']: + if not parameters["missing data as zero"]: coverage[:] = np.nan try: @@ -567,20 +726,26 @@ def compute_sub_matrix_worker(self, chrom, start, end, score_file_list, paramete if not self.quiet: sys.stderr.write( "No scores defined for region " - "{0} {1}:{2}-{3}. Skipping...\n".format(feature_name, - feature_chrom, - feature_start, - feature_end)) + "{0} {1}:{2}-{3}. Skipping...\n".format( + feature_name, feature_chrom, feature_start, feature_end + ) + ) coverage = np.zeros(matrix_cols) - if not parameters['missing data as zero']: + if not parameters["missing data as zero"]: coverage[:] = np.nan - if parameters['min threshold'] is not None and coverage.min() <= parameters['min threshold']: + if ( + parameters["min threshold"] is not None + and coverage.min() <= parameters["min threshold"] + ): continue - if parameters['max threshold'] is not None and coverage.max() >= parameters['max threshold']: + if ( + parameters["max threshold"] is not None + and coverage.max() >= parameters["max threshold"] + ): continue - if parameters['scale'] != 1: - coverage = parameters['scale'] * coverage + if parameters["scale"] != 1: + coverage = parameters["scale"] * coverage sub_matrix[j, :] = coverage @@ -598,7 +763,11 @@ def coverage_from_array(valuesArray, zones, binSize, avgType): try: valuesArray[0] except (IndexError, TypeError) as detail: - sys.stderr.write("{0}\nvalues array value: {1}, zones {2}\n".format(detail, valuesArray, zones)) + sys.stderr.write( + "{0}\nvalues array value: {1}, zones {2}\n".format( + detail, valuesArray, zones + ) + ) cvglist = [] zoneEnd = 0 @@ -616,7 +785,9 @@ def coverage_from_array(valuesArray, zones, binSize, avgType): if nBins == 1: pos_array = np.array([valStart]) else: - pos_array = np.linspace(valStart, valEnd, nBins, endpoint=False, dtype=int) + pos_array = np.linspace( + valStart, valEnd, nBins, endpoint=False, dtype=int + ) pos_array = np.append(pos_array, valEnd) idx = 0 @@ -624,7 +795,9 @@ def coverage_from_array(valuesArray, zones, binSize, avgType): idxStart = int(pos_array[idx]) idxEnd = max(int(pos_array[idx + 1]), idxStart + 1) try: - counts_list.append(heatmapper.my_average(valuesArray[idxStart:idxEnd], avgType)) + counts_list.append( + heatmapper.my_average(valuesArray[idxStart:idxEnd], avgType) + ) except Exception as detail: sys.stderr.write("Exception found: {0}\n".format(detail)) idx += 1 @@ -638,22 +811,23 @@ def change_chrom_names(chrom): Changes UCSC chromosome names to Ensembl chromosome names and vice versa. """ - if chrom.startswith('chr'): + if chrom.startswith("chr"): # remove the chr part from chromosome name chrom = chrom[3:] if chrom == "M": chrom = "MT" else: # prefix with 'chr' the chromosome name - chrom = 'chr' + chrom + chrom = "chr" + chrom if chrom == "chrMT": chrom = "chrM" return chrom @staticmethod - def coverage_from_big_wig(bigwig, chrom, zones, binSize, avgType, nansAsZeros=False, verbose=True): - + def coverage_from_big_wig( + bigwig, chrom, zones, binSize, avgType, nansAsZeros=False, verbose=True + ): """ uses pyBigWig to query a region defined by chrom and zones.
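As a sketch of the zones layout assembled by the worker above (each zone is a list of (start, end) blocks plus its number of bins; the coordinates and file name are hypothetical)::

    import pyBigWig

    bw = pyBigWig.open("scores.bw")
    zones = [
        ([(900, 1000)], 2),   # upstream: one 100 bp block -> 2 bins of 50 bp
        ([(1000, 1400)], 8),  # body: one 400 bp block -> 8 bins of 50 bp
    ]
    cov = heatmapper.coverage_from_big_wig(
        bw, "3R", zones, 50, "mean", nansAsZeros=True, verbose=False
    )
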
@@ -693,13 +867,17 @@ def coverage_from_big_wig(bigwig, chrom, zones, binSize, avgType, nansAsZeros=Fa chrom = heatmapper.change_chrom_names(chrom) if chrom not in list(bigwig.chroms().keys()): if verbose: - sys.stderr.write("Warning: Your chromosome names do not match.\nPlease check that the " - "chromosome names in your BED file\ncorrespond to the names in your " - "bigWig file.\nAn empty line will be added to your heatmap.\nThe problematic " - "chromosome name is {0}\n\n".format(unmod_name)) + sys.stderr.write( + "Warning: Your chromosome names do not match.\nPlease check that the " + "chromosome names in your BED file\ncorrespond to the names in your " + "bigWig file.\nAn empty line will be added to your heatmap.\nThe problematic " + "chromosome name is {0}\n\n".format(unmod_name) + ) # return empty nan array - return heatmapper.coverage_from_array(values_array, zones, binSize, avgType) + return heatmapper.coverage_from_array( + values_array, zones, binSize, avgType + ) maxLen = bigwig.chroms(chrom) startIdx = 0 @@ -726,11 +904,10 @@ def coverage_from_big_wig(bigwig, chrom, zones, binSize, avgType, nansAsZeros=Fa if nansAsZeros: values_array[np.isnan(values_array)] = 0 - return heatmapper.coverage_from_array(values_array, zones, - binSize, avgType) + return heatmapper.coverage_from_array(values_array, zones, binSize, avgType) @staticmethod - def my_average(valuesArray, avgType='mean'): + def my_average(valuesArray, avgType="mean"): """ computes the mean, median, etc but only for those values that are not Nan @@ -757,6 +934,7 @@ def read_matrix_file(self, matrix_file): # to split the heatmap into groups import json + regions = [] matrix_rows = [] current_group_index = 0 @@ -771,11 +949,11 @@ def read_matrix_file(self, matrix_file): # the parameters used are saved using # json self.parameters = json.loads(line[1:].strip()) - max_group_bound = self.parameters['group_boundaries'][1] + max_group_bound = self.parameters["group_boundaries"][1] continue # split the line into bed interval and matrix values - region = line.split('\t') + region = line.split("\t") chrom, start, end, name, score, strand = region[0:6] matrix_row = np.ma.masked_invalid(np.fromiter(region[6:], np.float)) matrix_rows.append(matrix_row) @@ -785,18 +963,25 @@ def read_matrix_file(self, matrix_file): # get the group index if len(regions) >= max_group_bound: current_group_index += 1 - max_group_bound = self.parameters['group_boundaries'][current_group_index + 1] + max_group_bound = self.parameters["group_boundaries"][ + current_group_index + 1 + ] regions.append([chrom, regs, name, max_group_bound, strand, score]) matrix = np.vstack(matrix_rows) - self.matrix = _matrix(regions, matrix, self.parameters['group_boundaries'], - self.parameters['sample_boundaries'], - group_labels=self.parameters['group_labels'], - sample_labels=self.parameters['sample_labels']) - - if 'sort regions' in self.parameters: - self.matrix.set_sorting_method(self.parameters['sort regions'], - self.parameters['sort using']) + self.matrix = _matrix( + regions, + matrix, + self.parameters["group_boundaries"], + self.parameters["sample_boundaries"], + group_labels=self.parameters["group_labels"], + sample_labels=self.parameters["sample_labels"], + ) + + if "sort regions" in self.parameters: + self.matrix.set_sorting_method( + self.parameters["sort regions"], self.parameters["sort using"] + ) # Versions of computeMatrix before 3.0 didn't have an entry of these per column, fix that nSamples = len(self.matrix.sample_labels) @@ -828,10 +1013,11 @@ def 
save_matrix(self, file_name): The file is gzipped. """ import json - self.parameters['sample_labels'] = self.matrix.sample_labels - self.parameters['group_labels'] = self.matrix.group_labels - self.parameters['sample_boundaries'] = self.matrix.sample_boundaries - self.parameters['group_boundaries'] = self.matrix.group_boundaries + + self.parameters["sample_labels"] = self.matrix.sample_labels + self.parameters["group_labels"] = self.matrix.group_labels + self.parameters["sample_boundaries"] = self.matrix.sample_boundaries + self.parameters["group_boundaries"] = self.matrix.group_boundaries # Redo the parameters, ensuring things related to ticks and labels are repeated appropriately nSamples = len(self.matrix.sample_labels) @@ -844,8 +1030,8 @@ def save_matrix(self, file_name): if len(v) == 0: v = [None] * nSamples h[k] = v - fh = gzip.open(file_name, 'wb') - params_str = json.dumps(h, separators=(',', ':')) + fh = gzip.open(file_name, "wb") + params_str = json.dumps(h, separators=(",", ":")) fh.write(toBytes("@" + params_str + "\n")) score_list = np.ma.masked_invalid(np.mean(self.matrix.matrix, axis=1)) for idx, region in enumerate(self.matrix.regions): @@ -853,25 +1039,35 @@ def save_matrix(self, file_name): # keeping nans while converting them to strings if not np.ma.is_masked(score_list[idx]): np.float(score_list[idx]) - matrix_values = "\t".join( - np.char.mod('%f', self.matrix.matrix[idx, :])) + matrix_values = "\t".join(np.char.mod("%f", self.matrix.matrix[idx, :])) starts = ["{0}".format(x[0]) for x in region[1]] ends = ["{0}".format(x[1]) for x in region[1]] starts = ",".join(starts) ends = ",".join(ends) # BEDish format (we don't currently store the score) fh.write( - toBytes('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n'.format( + toBytes( + "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format( region[0], starts, ends, region[2], region[5], region[4], - matrix_values))) + matrix_values, + ) + ) + ) fh.close() - def save_tabulated_values(self, file_handle, reference_point_label='TSS', start_label='TSS', end_label='TES', averagetype='mean'): + def save_tabulated_values( + self, + file_handle, + reference_point_label="TSS", + start_label="TSS", + end_label="TES", + averagetype="mean", + ): """ Saves the values averaged by col using the avg_type given @@ -885,30 +1081,37 @@ def save_tabulated_values(self, file_handle, reference_point_label='TSS', start_ """ # get X labels - w = self.parameters['bin size'] - b = self.parameters['upstream'] - a = self.parameters['downstream'] - c = self.parameters.get('unscaled 5 prime', 0) - d = self.parameters.get('unscaled 3 prime', 0) - m = self.parameters['body'] + w = self.parameters["bin size"] + b = self.parameters["upstream"] + a = self.parameters["downstream"] + c = self.parameters.get("unscaled 5 prime", 0) + d = self.parameters.get("unscaled 3 prime", 0) + m = self.parameters["body"] xticks = [] xtickslabel = [] for idx in range(self.matrix.get_num_samples()): if b[idx] < 1e5: quotient = 1000 - symbol = 'Kb' + symbol = "Kb" else: quotient = 1e6 - symbol = 'Mb' + symbol = "Mb" if m[idx] == 0: last = 0 if len(xticks): last = xticks[-1] - xticks.extend([last + (k / w[idx]) for k in [w[idx], b[idx], b[idx] + a[idx]]]) - xtickslabel.extend(['{0:.1f}{1}'.format(-(float(b[idx]) / quotient), symbol), reference_point_label, - '{0:.1f}{1}'.format(float(a[idx]) / quotient, symbol)]) + xticks.extend( + [last + (k / w[idx]) for k in [w[idx], b[idx], b[idx] + a[idx]]] + ) + xtickslabel.extend( + [ + "{0:.1f}{1}".format(-(float(b[idx]) / quotient), symbol), + 
reference_point_label, + "{0:.1f}{1}".format(float(a[idx]) / quotient, symbol), + ] + ) else: xticks_values = [w[idx]] @@ -916,7 +1119,9 @@ def save_tabulated_values(self, file_handle, reference_point_label='TSS', start_ # only if upstream region is set, add a x tick if b[idx] > 0: xticks_values.append(b[idx]) - xtickslabel.append('{0:.1f}{1}'.format(-(float(b[idx]) / quotient), symbol)) + xtickslabel.append( + "{0:.1f}{1}".format(-(float(b[idx]) / quotient), symbol) + ) xtickslabel.append(start_label) @@ -933,7 +1138,9 @@ def save_tabulated_values(self, file_handle, reference_point_label='TSS', start_ if a[idx] > 0: xticks_values.append(b[idx] + c[idx] + m[idx] + d[idx] + a[idx]) - xtickslabel.append('{0:.1f}{1}'.format(float(a[idx]) / quotient, symbol)) + xtickslabel.append( + "{0:.1f}{1}".format(float(a[idx]) / quotient, symbol) + ) last = 0 if len(xticks): @@ -949,34 +1156,47 @@ def save_tabulated_values(self, file_handle, reference_point_label='TSS', start_ else: labs.append("") - with open(file_handle, 'w') as fh: + with open(file_handle, "w") as fh: # write labels fh.write("bin labels\t\t{}\n".format("\t".join(labs))) - fh.write('bins\t\t{}\n'.format("\t".join([str(x) for x in x_axis]))) + fh.write("bins\t\t{}\n".format("\t".join([str(x) for x in x_axis]))) for sample_idx in range(self.matrix.get_num_samples()): for group_idx in range(self.matrix.get_num_groups()): sub_matrix = self.matrix.get_matrix(group_idx, sample_idx) - values = [str(x) for x in np.ma.__getattribute__(averagetype)(sub_matrix['matrix'], axis=0)] - fh.write("{}\t{}\t{}\n".format(sub_matrix['sample'], sub_matrix['group'], "\t".join(values))) + values = [ + str(x) + for x in np.ma.__getattribute__(averagetype)( + sub_matrix["matrix"], axis=0 + ) + ] + fh.write( + "{}\t{}\t{}\n".format( + sub_matrix["sample"], sub_matrix["group"], "\t".join(values) + ) + ) def save_matrix_values(self, file_name): # print a header telling the group names and their length - fh = open(file_name, 'wb') + fh = open(file_name, "wb") info = [] groups_len = np.diff(self.matrix.group_boundaries) for i in range(len(self.matrix.group_labels)): - info.append("{}:{}".format(self.matrix.group_labels[i], - groups_len[i])) + info.append("{}:{}".format(self.matrix.group_labels[i], groups_len[i])) fh.write(toBytes("#{}\n".format("\t".join(info)))) # add to header the x axis values - fh.write(toBytes("#downstream:{}\tupstream:{}\tbody:{}\tbin size:{}\tunscaled 5 prime:{}\tunscaled 3 prime:{}\n".format( - self.parameters['downstream'], - self.parameters['upstream'], - self.parameters['body'], - self.parameters['bin size'], - self.parameters.get('unscaled 5 prime', 0), - self.parameters.get('unscaled 3 prime', 0)))) + fh.write( + toBytes( + "#downstream:{}\tupstream:{}\tbody:{}\tbin size:{}\tunscaled 5 prime:{}\tunscaled 3 prime:{}\n".format( + self.parameters["downstream"], + self.parameters["upstream"], + self.parameters["body"], + self.parameters["bin size"], + self.parameters.get("unscaled 5 prime", 0), + self.parameters.get("unscaled 3 prime", 0), + ) + ) + ) sample_len = np.diff(self.matrix.sample_boundaries) for i in range(len(self.matrix.sample_labels)): info.extend([self.matrix.sample_labels[i]] * sample_len[i]) @@ -984,14 +1204,16 @@ def save_matrix_values(self, file_name): fh.close() # reopen again using append mode - fh = open(file_name, 'ab') + fh = open(file_name, "ab") np.savetxt(fh, self.matrix.matrix, fmt="%.4g", delimiter="\t") fh.close() def save_BED(self, file_handle): boundaries = np.array(self.matrix.group_boundaries) # Add a 
header - file_handle.write("#chrom\tstart\tend\tname\tscore\tstrand\tthickStart\tthickEnd\titemRGB\tblockCount\tblockSizes\tblockStart\tdeepTools_group") + file_handle.write( + "#chrom\tstart\tend\tname\tscore\tstrand\tthickStart\tthickEnd\titemRGB\tblockCount\tblockSizes\tblockStart\tdeepTools_group" + ) if self.matrix.silhouette is not None: file_handle.write("\tsilhouette") file_handle.write("\n") @@ -1008,26 +1230,30 @@ def save_BED(self, file_handle): starts = ",".join(starts) ends = ",".join(ends) file_handle.write( - '{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{1}\t{2}\t0'.format( + "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{1}\t{2}\t0".format( region[0], region[1][0][0], region[1][-1][1], region[2], region[5], - region[4])) + region[4], + ) + ) file_handle.write( - '\t{0}\t{1}\t{2}\t{3}'.format( + "\t{0}\t{1}\t{2}\t{3}".format( len(region[1]), ",".join([str(int(y) - int(x)) for x, y in region[1]]), ",".join([str(int(x) - int(starts[0])) for x, y in region[1]]), - self.matrix.group_labels[label_idx])) + self.matrix.group_labels[label_idx], + ) + ) if self.matrix.silhouette is not None: file_handle.write("\t{}".format(self.matrix.silhouette[idx])) file_handle.write("\n") file_handle.close() @staticmethod - def matrix_avg(matrix, avgType='mean'): + def matrix_avg(matrix, avgType="mean"): matrix = np.ma.masked_invalid(matrix) return np.ma.__getattribute__(avgType)(matrix, axis=0) @@ -1042,7 +1268,7 @@ def get_individual_matrices(self, matrix): for i in range(0, num_cols, num_ind_cols): if i + num_ind_cols > num_cols: break - matrices_list.append(matrix[:, i:i + num_ind_cols]) + matrices_list.append(matrix[:, i : i + num_ind_cols]) return matrices_list def get_num_individual_matrix_cols(self): @@ -1053,8 +1279,13 @@ def get_num_individual_matrix_cols(self): of smaller matrices that are merged one after the other. """ - matrixCols = ((self.parameters['downstream'] + self.parameters['upstream'] + self.parameters['body'] + self.parameters['unscaled 5 prime'] + self.parameters['unscaled 3 prime']) // - self.parameters['bin size']) + matrixCols = ( + self.parameters["downstream"] + + self.parameters["upstream"] + + self.parameters["body"] + + self.parameters["unscaled 5 prime"] + + self.parameters["unscaled 3 prime"] + ) // self.parameters["bin size"] return matrixCols @@ -1064,8 +1295,8 @@ def computeSilhouetteScore(d, idx, labels): Given a square distance matrix with NaN diagonals, compute the silhouette score of a given row (idx). Each row should have an associated label (labels). 
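A toy check of this convention (mirroring how computeSilhouette below builds the matrix with pdist/squareform and a NaN diagonal; the data are made up)::

    import numpy as np
    from scipy.spatial.distance import pdist, squareform

    m = np.array([[0.0, 0.1], [0.2, 0.0], [5.0, 5.1], [5.2, 4.9]])
    labels = np.array([0, 0, 1, 1])  # two clusters of two rows each
    d = squareform(pdist(m))
    np.fill_diagonal(d, np.nan)
    s = computeSilhouetteScore(d, 0, labels)  # close to 1: well separated
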
""" - keep = ~np.isnan(d[idx, ]) - foo = np.bincount(labels[keep], weights=d[idx, ][keep]) + keep = ~np.isnan(d[idx,]) + foo = np.bincount(labels[keep], weights=d[idx,][keep]) groupSizes = np.bincount(labels[keep]) intraIdx = labels[idx] if groupSizes[intraIdx] == 1: @@ -1090,14 +1321,22 @@ class to hold heatmapper matrices This is an internal class of the heatmapper class """ - def __init__(self, regions, matrix, group_boundaries, sample_boundaries, - group_labels=None, sample_labels=None): - + def __init__( + self, + regions, + matrix, + group_boundaries, + sample_boundaries, + group_labels=None, + sample_labels=None, + ): # simple checks - assert matrix.shape[0] == group_boundaries[-1], \ - "row max do not match matrix shape" - assert matrix.shape[1] == sample_boundaries[-1], \ - "col max do not match matrix shape" + assert ( + matrix.shape[0] == group_boundaries[-1] + ), "row max do not match matrix shape" + assert ( + matrix.shape[1] == sample_boundaries[-1] + ), "col max do not match matrix shape" self.regions = regions self.matrix = matrix @@ -1108,19 +1347,23 @@ def __init__(self, regions, matrix, group_boundaries, sample_boundaries, self.silhouette = None if group_labels is None: - self.group_labels = ['group {}'.format(x) - for x in range(len(group_boundaries) - 1)] + self.group_labels = [ + "group {}".format(x) for x in range(len(group_boundaries) - 1) + ] else: - assert len(group_labels) == len(group_boundaries) - 1, \ - "number of group labels does not match number of groups" + assert ( + len(group_labels) == len(group_boundaries) - 1 + ), "number of group labels does not match number of groups" self.group_labels = group_labels if sample_labels is None: - self.sample_labels = ['sample {}'.format(x) - for x in range(len(sample_boundaries) - 1)] + self.sample_labels = [ + "sample {}".format(x) for x in range(len(sample_boundaries) - 1) + ] else: - assert len(sample_labels) == len(sample_boundaries) - 1, \ - "number of sample labels does not match number of samples" + assert ( + len(sample_labels) == len(sample_boundaries) - 1 + ), "number of sample labels does not match number of samples" self.sample_labels = sample_labels def get_matrix(self, group, sample): @@ -1140,9 +1383,13 @@ def get_matrix(self, group, sample): sample_start = self.sample_boundaries[sample] sample_end = self.sample_boundaries[sample + 1] - return {'matrix': np.ma.masked_invalid(self.matrix[group_start:group_end, :][:, sample_start:sample_end]), - 'group': self.group_labels[group], - 'sample': self.sample_labels[sample]} + return { + "matrix": np.ma.masked_invalid( + self.matrix[group_start:group_end, :][:, sample_start:sample_end] + ), + "group": self.group_labels[group], + "sample": self.sample_labels[sample], + } def get_num_samples(self): return len(self.sample_labels) @@ -1151,15 +1398,13 @@ def get_num_groups(self): return len(self.group_labels) def set_group_labels(self, new_labels): - """ sets new labels for groups - """ + """sets new labels for groups""" if len(new_labels) != len(self.group_labels): raise ValueError("length new labels != length original labels") self.group_labels = new_labels def set_sample_labels(self, new_labels): - """ sets new labels for groups - """ + """sets new labels for groups""" if len(new_labels) != len(self.sample_labels): raise ValueError("length new labels != length original labels") self.sample_labels = new_labels @@ -1190,19 +1435,22 @@ def get_regions(self): return regions - def sort_groups(self, sort_using='mean', sort_method='no', sample_list=None): + def 
sort_groups(self, sort_using="mean", sort_method="no", sample_list=None): """ Sorts and rearranges the submatrices according to the sorting method given. """ - if sort_method == 'no': + if sort_method == "no": return if (sample_list is not None) and (len(sample_list) > 0): # get the ids that correspond to the selected sample list idx_to_keep = [] for sample_idx in sample_list: - idx_to_keep += range(self.sample_boundaries[sample_idx], self.sample_boundaries[sample_idx + 1]) + idx_to_keep += range( + self.sample_boundaries[sample_idx], + self.sample_boundaries[sample_idx + 1], + ) matrix = self.matrix[:, idx_to_keep] @@ -1210,22 +1458,22 @@ def sort_groups(self, sort_using='mean', sort_method='no', sample_list=None): matrix = self.matrix # compute the row average: - if sort_using == 'region_length': + if sort_using == "region_length": matrix_avgs = list() for x in self.regions: matrix_avgs.append(np.sum([bar[1] - bar[0] for bar in x[1]])) matrix_avgs = np.array(matrix_avgs) - elif sort_using == 'mean': + elif sort_using == "mean": matrix_avgs = np.nanmean(matrix, axis=1) - elif sort_using == 'mean': - matrix_avgs = np.nanmean(matrix, axis=1) - elif sort_using == 'median': + elif sort_using == "median": matrix_avgs = np.nanmedian(matrix, axis=1) - elif sort_using == 'max': + elif sort_using == "max": matrix_avgs = np.nanmax(matrix, axis=1) - elif sort_using == 'min': + elif sort_using == "min": matrix_avgs = np.nanmin(matrix, axis=1) - elif sort_using == 'sum': + elif sort_using == "sum": matrix_avgs = np.nansum(matrix, axis=1) else: sys.exit("{} is an unsupported sorting method".format(sort_using)) @@ -1237,7 +1485,7 @@ start = self.group_boundaries[idx] end = self.group_boundaries[idx + 1] order = matrix_avgs[start:end].argsort() - if sort_method == 'descend': + if sort_method == "descend": order = order[::-1] _sorted_matrix.append(self.matrix[start:end, :][order, :]) # sort the regions @@ -1249,31 +1497,37 @@ def sort_groups(self, sort_using='mean', sort_method='no', sample_list=None): self.regions = _sorted_regions self.set_sorting_method(sort_method, sort_using) - def hmcluster(self, k, evaluate_silhouette=True, method='kmeans', clustering_samples=None): + def hmcluster( + self, k, evaluate_silhouette=True, method="kmeans", clustering_samples=None + ): matrix = np.asarray(self.matrix) matrix_to_cluster = matrix if clustering_samples is not None: - assert all(i > 0 for i in clustering_samples),\ - "all indices should be bigger than or equal to 1." + assert all( + i > 0 for i in clustering_samples + ), "all indices should be bigger than or equal to 1."
+ assert all(i <= len(self.sample_labels) for i in clustering_samples), ( + "each index should be smaller than or equal to {}(total " "number of samples.)".format(len(self.sample_labels)) + ) clustering_samples = np.asarray(clustering_samples) - 1 samples_cols = [] for idx in clustering_samples: - samples_cols += range(self.sample_boundaries[idx], - self.sample_boundaries[idx + 1]) + samples_cols += range( + self.sample_boundaries[idx], self.sample_boundaries[idx + 1] + ) matrix_to_cluster = matrix_to_cluster[:, samples_cols] if np.any(np.isnan(matrix_to_cluster)): # replace nans for 0 otherwise kmeans produces a weird behaviour - sys.stderr.write("*Warning* For clustering nan values have to be replaced by zeros \n") + sys.stderr.write( + "*Warning* For clustering nan values have to be replaced by zeros \n" + ) matrix_to_cluster[np.isnan(matrix_to_cluster)] = 0 - if method == 'kmeans': + if method == "kmeans": from scipy.cluster.vq import vq, kmeans centroids, _ = kmeans(matrix_to_cluster, k) @@ -1281,11 +1535,12 @@ def hmcluster(self, k, evaluate_silhouette=True, method='kmeans', clustering_sam # get the same cluster order cluster_labels, _ = vq(matrix_to_cluster, centroids) - if method == 'hierarchical': + if method == "hierarchical": # normally too slow for large data sets from scipy.cluster.hierarchy import fcluster, linkage - Z = linkage(matrix_to_cluster, method='ward', metric='euclidean') - cluster_labels = fcluster(Z, k, criterion='maxclust') + + Z = linkage(matrix_to_cluster, method="ward", metric="euclidean") + cluster_labels = fcluster(Z, k, criterion="maxclust") # hierarchical clustering labels from 1 .. k # while k-means labels 0 .. k -1 # Thus, for consistency, we subtract 1 @@ -1309,10 +1564,11 @@ def hmcluster(self, k, evaluate_silhouette=True, method='kmeans', clustering_sam cluster_number = 1 for cluster in cluster_order: cluster_ids = _cluster_ids_list[cluster] - self.group_labels.append("cluster_{}_n{}".format(cluster_number,len(cluster_ids))) + self.group_labels.append( + "cluster_{}_n{}".format(cluster_number, len(cluster_ids)) + ) cluster_number += 1 - self.group_boundaries.append(self.group_boundaries[-1] + - len(cluster_ids)) + self.group_boundaries.append(self.group_boundaries[-1] + len(cluster_ids)) _clustered_matrix.append(self.matrix[cluster_ids, :]) for idx in cluster_ids: _clustered_regions.append(self.regions[idx]) @@ -1327,7 +1583,9 @@ def computeSilhouette(self, k): from scipy.spatial.distance import pdist, squareform silhouette = np.repeat(0.0, self.group_boundaries[-1]) - groupSizes = np.subtract(self.group_boundaries[1:], self.group_boundaries[:-1]) + groupSizes = np.subtract( + self.group_boundaries[1:], self.group_boundaries[:-1] + ) labels = np.repeat(np.arange(k), groupSizes) d = pdist(self.matrix) @@ -1335,7 +1593,9 @@ def computeSilhouette(self, k): np.fill_diagonal(d2, np.nan) # This excludes the diagonal for idx in range(len(labels)): silhouette[idx] = computeSilhouetteScore(d2, idx, labels) - sys.stderr.write("The average silhouette score is: {}\n".format(np.mean(silhouette))) + sys.stderr.write( + "The average silhouette score is: {}\n".format(np.mean(silhouette)) + ) self.silhouette = silhouette def removeempty(self): @@ -1353,7 +1613,9 @@ def removeempty(self): self.matrix = self.matrix[to_keep, :] # adjust sample boundaries to_keep = np.array(to_keep) - self.group_boundaries = [len(to_keep[to_keep < x]) for x in self.group_boundaries] + self.group_boundaries = [ + len(to_keep[to_keep < x]) for x in self.group_boundaries + ] def 
flatten(self): """ @@ -1367,6 +1629,7 @@ def flatten(self): matrix_flatten = matrix_flatten[~np.isnan(matrix_flatten)] if len(matrix_flatten) == 0: num_nan = len(np.flatnonzero(np.isnan(self.matrix.flatten()))) - raise ValueError("matrix only contains nans " - "(total nans: {})".format(num_nan)) + raise ValueError( + "matrix only contains nans " "(total nans: {})".format(num_nan) + ) return matrix_flatten diff --git a/deeptools/heatmapper_utilities.py b/deeptools/heatmapper_utilities.py index e63dfb022..18cda971f 100644 --- a/deeptools/heatmapper_utilities.py +++ b/deeptools/heatmapper_utilities.py @@ -1,16 +1,17 @@ import numpy as np import matplotlib -matplotlib.use('Agg') -matplotlib.rcParams['pdf.fonttype'] = 42 -matplotlib.rcParams['svg.fonttype'] = 'none' + +matplotlib.use("Agg") +matplotlib.rcParams["pdf.fonttype"] = 42 +matplotlib.rcParams["svg.fonttype"] = "none" from deeptools import cm # noqa: F401 import matplotlib.colors as pltcolors import plotly.graph_objs as go -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") -def plot_single(ax, ma, average_type, color, label, plot_type='lines'): +def plot_single(ax, ma, average_type, color, label, plot_type="lines"): """ Adds a line to the plot in the given ax using the specified method @@ -69,11 +70,11 @@ def plot_single(ax, ma, average_type, color, label, plot_type='lines'): if isinstance(color, np.ndarray): color = pltcolors.to_hex(color, keep_alpha=True) ax.plot(x, summary, color=color, label=label, alpha=0.9) - if plot_type == 'fill': - ax.fill_between(x, summary, facecolor=color, alpha=0.6, edgecolor='none') + if plot_type == "fill": + ax.fill_between(x, summary, facecolor=color, alpha=0.6, edgecolor="none") - if plot_type in ['se', 'std']: - if plot_type == 'se': # standard error + if plot_type in ["se", "std"]: + if plot_type == "se": # standard error std = np.std(ma, axis=0) / np.sqrt(ma.shape[0]) else: std = np.std(ma, axis=0) @@ -83,39 +84,49 @@ def plot_single(ax, ma, average_type, color, label, plot_type='lines'): # between the mean (or median etc.) and the std or se f_color = pltcolors.colorConverter.to_rgba(color, alpha) - ax.fill_between(x, summary, summary + std, facecolor=f_color, edgecolor='none') - ax.fill_between(x, summary, summary - std, facecolor=f_color, edgecolor='none') + ax.fill_between(x, summary, summary + std, facecolor=f_color, edgecolor="none") + ax.fill_between(x, summary, summary - std, facecolor=f_color, edgecolor="none") ax.set_xlim(0, max(x)) return ax -def plotly_single(ma, average_type, color, label, plot_type='line'): +def plotly_single(ma, average_type, color, label, plot_type="line"): """A plotly version of plot_single. 
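A minimal sketch with toy data might look as follows (the masked array mirrors what matrix_avg consumes elsewhere in this module; the result can be handed to any plotly figure)::

    import numpy as np
    import plotly.graph_objs as go

    ma = np.ma.masked_invalid(np.random.rand(20, 100))
    traces = plotly_single(ma, "mean", "blue", "sample 1", plot_type="se")
    fig = go.Figure(data=traces)  # mean line plus shaded standard error
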
Returns a list of traces""" summary = list(np.ma.__getattribute__(average_type)(ma, axis=0)) x = list(np.arange(len(summary))) if isinstance(color, str): color = list(matplotlib.colors.to_rgb(color)) - traces = [go.Scatter(x=x, y=summary, name=label, line={'color': "rgba({},{},{},0.9)".format(color[0], color[1], color[2])}, showlegend=False)] - if plot_type == 'fill': - traces[0].update(fill='tozeroy', fillcolor=color) - - if plot_type in ['se', 'std']: - if plot_type == 'se': # standard error + traces = [ + go.Scatter( + x=x, + y=summary, + name=label, + line={"color": "rgba({},{},{},0.9)".format(color[0], color[1], color[2])}, + showlegend=False, + ) + ] + if plot_type == "fill": + traces[0].update(fill="tozeroy", fillcolor=color) + + if plot_type in ["se", "std"]: + if plot_type == "se": # standard error std = np.std(ma, axis=0) / np.sqrt(ma.shape[0]) else: std = np.std(ma, axis=0) x_rev = x[::-1] lower = summary - std - trace = go.Scatter(x=x + x_rev, - y=np.concatenate([summary + std, lower[::-1]]), - fill='tozerox', - fillcolor="rgba({},{},{},0.2)".format(color[0], color[1], color[2]), - line=go.Line(color='transparent'), - showlegend=False, - name=label) + trace = go.Scatter( + x=x + x_rev, + y=np.concatenate([summary + std, lower[::-1]]), + fill="tozerox", + fillcolor="rgba({},{},{},0.2)".format(color[0], color[1], color[2]), + line=go.Line(color="transparent"), + showlegend=False, + name=label, + ) traces.append(trace) return traces @@ -133,42 +144,44 @@ def getProfileTicks(hm, referencePointLabel, startLabel, endLabel, idx): As of matplotlib 3.1 there is no longer padding added to all ticks. Reference point ticks will be adjusted by width/2 or width for spacing and the last half of scaled ticks will be shifted by 1 bin so the ticks are at the beginning of bins.
""" - w = hm.parameters['bin size'] - b = hm.parameters['upstream'] - a = hm.parameters['downstream'] + w = hm.parameters["bin size"] + b = hm.parameters["upstream"] + a = hm.parameters["downstream"] if idx is not None: w = w[idx] b = b[idx] a = a[idx] try: - c = hm.parameters['unscaled 5 prime'] + c = hm.parameters["unscaled 5 prime"] if idx is not None: c = c[idx] except: c = 0 try: - d = hm.parameters['unscaled 3 prime'] + d = hm.parameters["unscaled 3 prime"] if idx is not None: d = d[idx] except: d = 0 - m = hm.parameters['body'] + m = hm.parameters["body"] if idx is not None: m = m[idx] if b < 1e5: quotient = 1000 - symbol = 'Kb' + symbol = "Kb" else: quotient = 1e6 - symbol = 'Mb' + symbol = "Mb" if m == 0: xticks = [(k / w) for k in [0, b - 0.5 * w, b + a - w]] - xtickslabel = ['{0:.1f}'.format(-(float(b) / quotient)), - referencePointLabel, - '{0:.1f}{1}'.format(float(a) / quotient, symbol)] + xtickslabel = [ + "{0:.1f}".format(-(float(b) / quotient)), + referencePointLabel, + "{0:.1f}{1}".format(float(a) / quotient, symbol), + ] else: xticks_values = [0] xtickslabel = [] @@ -176,7 +189,7 @@ def getProfileTicks(hm, referencePointLabel, startLabel, endLabel, idx): # only if upstream region is set, add a x tick if b > 0: xticks_values.append(b) - xtickslabel.append('{0:.1f}'.format(-(float(b) / quotient))) + xtickslabel.append("{0:.1f}".format(-(float(b) / quotient))) xtickslabel.append(startLabel) @@ -196,7 +209,7 @@ def getProfileTicks(hm, referencePointLabel, startLabel, endLabel, idx): if a > 0: xticks_values.append(b + c + m + d + a - w) - xtickslabel.append('{0:.1f}{1}'.format(float(a) / quotient, symbol)) + xtickslabel.append("{0:.1f}{1}".format(float(a) / quotient, symbol)) xticks = [(k / w) for k in xticks_values] xticks = [max(x, 0) for x in xticks] diff --git a/deeptools/mapReduce.py b/deeptools/mapReduce.py index af0b1647c..1f65e1326 100644 --- a/deeptools/mapReduce.py +++ b/deeptools/mapReduce.py @@ -5,19 +5,23 @@ debug = 0 -def mapReduce(staticArgs, func, chromSize, - genomeChunkLength=None, - region=None, - bedFile=None, - blackListFileName=None, - numberOfProcessors=4, - verbose=False, - includeLabels=False, - keepExons=False, - transcriptID="transcriptID", - exonID="exonID", - transcript_id_designator="transcript_id", - self_=None): +def mapReduce( + staticArgs, + func, + chromSize, + genomeChunkLength=None, + region=None, + bedFile=None, + blackListFileName=None, + numberOfProcessors=4, + verbose=False, + includeLabels=False, + keepExons=False, + transcriptID="transcriptID", + exonID="exonID", + transcript_id_designator="transcript_id", + self_=None, +): """ Split the genome into parts that are sent to workers using a defined number of procesors. Results are collected and returned. 
@@ -63,8 +67,9 @@ def mapReduce(staticArgs, func, chromSize, genomeChunkLength = int(genomeChunkLength) if verbose: - print("genome partition size for multiprocessing: {0}".format( - genomeChunkLength)) + print( + "genome partition size for multiprocessing: {0}".format(genomeChunkLength) + ) region_start = 0 region_end = None @@ -73,16 +78,29 @@ def mapReduce(staticArgs, func, chromSize, # the given genomic position if region: - chromSize, region_start, region_end, genomeChunkLength = getUserRegion(chromSize, region) + chromSize, region_start, region_end, genomeChunkLength = getUserRegion( + chromSize, region + ) if verbose: - print("chrom size: {0}, region start: {1}, region end: {2}, " - "genome chunk length sent to each procesor: {3}".format(chromSize, region_start, region_end, genomeChunkLength)) + print( + "chrom size: {0}, region start: {1}, region end: {2}, " + "genome chunk length sent to each processor: {3}".format( + chromSize, region_start, region_end, genomeChunkLength + ) + ) if bedFile: defaultGroup = None if len(bedFile) == 1: defaultGroup = "genes" - bed_interval_tree = GTF(bedFile, defaultGroup=defaultGroup, transcriptID=transcriptID, exonID=exonID, transcript_id_designator=transcript_id_designator, keepExons=keepExons) + bed_interval_tree = GTF( + bedFile, + defaultGroup=defaultGroup, + transcriptID=transcriptID, + exonID=exonID, + transcript_id_designator=transcript_id_designator, + keepExons=keepExons, + ) if blackListFileName: blackList = GTF(blackListFileName) @@ -121,9 +139,28 @@ def mapReduce(staticArgs, func, chromSize, # TODO, there's no point in including the chromosome if includeLabels: - bed_regions_list = [[chrom, x[4], x[2], x[3], x[5], x[6]] for x in bed_interval_tree.findOverlaps(chrom, reg[0], reg[1], trimOverlap=True, numericGroups=True, includeStrand=True)] + bed_regions_list = [ + [chrom, x[4], x[2], x[3], x[5], x[6]] + for x in bed_interval_tree.findOverlaps( + chrom, + reg[0], + reg[1], + trimOverlap=True, + numericGroups=True, + includeStrand=True, + ) + ] else: - bed_regions_list = [[chrom, x[4], x[5], x[6]] for x in bed_interval_tree.findOverlaps(chrom, reg[0], reg[1], trimOverlap=True, includeStrand=True)] + bed_regions_list = [ + [chrom, x[4], x[5], x[6]] + for x in bed_interval_tree.findOverlaps( + chrom, + reg[0], + reg[1], + trimOverlap=True, + includeStrand=True, + ) + ] if len(bed_regions_list) == 0: continue @@ -134,9 +171,12 @@ def mapReduce(staticArgs, func, chromSize, if len(TASKS) > 1 and numberOfProcessors > 1: if verbose: - print(("using {} processors for {} " - "number of tasks".format(numberOfProcessors, - len(TASKS)))) + print( + ( + "using {} processors for {} " + "number of tasks".format(numberOfProcessors, len(TASKS)) + ) + ) random.shuffle(TASKS) pool = multiprocessing.Pool(numberOfProcessors) res = pool.map_async(func, TASKS).get(9999999) @@ -199,21 +239,28 @@ def getUserRegion(chrom_sizes, region_string, max_chunk_size=1e6): else: chromUse = "chr" + chrom if chromUse not in list(chrom_sizes.keys()): - raise NameError("Unknown chromosome: %s\nKnown " - "chromosomes are: %s " % (chrom, list(chrom_sizes.keys()))) + raise NameError( + "Unknown chromosome: %s\nKnown " + "chromosomes are: %s " % (chrom, list(chrom_sizes.keys())) + ) chrom = chromUse try: region_start = int(region[1]) except IndexError: region_start = 0 try: - region_end = int(region[2]) if int(region[2]) <= chrom_sizes[chrom] \ + region_end = ( + int(region[2]) + if int(region[2]) <= chrom_sizes[chrom] + else chrom_sizes[chrom] + ) except IndexError: region_end = 
chrom_sizes[chrom] if region_start > region_end or region_start < 0: - raise NameError("{} not valid. The format is chrom:start:end. " - "Without comas, dashes or dots. ".format(region_string)) + raise NameError( + "{} not valid. The format is chrom:start:end. " + "Without commas, dashes or dots. ".format(region_string) + ) try: tilesize = int(region[3]) except IndexError: diff --git a/deeptools/misc.py b/deeptools/misc.py index f20b22f14..14f598c29 100644 --- a/deeptools/misc.py +++ b/deeptools/misc.py @@ -3,11 +3,11 @@ # This should force numpy to run single threaded. See issue #697 # This module MUST be imported before numpy # Note that these environment variables are internal to deepTools (they won't exist on the shell after the command completes) -if 'MKL_NUM_THREADS' not in os.environ: - os.environ['MKL_NUM_THREADS'] = 'sequential' -if 'NUMEXPR_NUM_THREADS' not in os.environ: - os.environ['NUMEXPR_NUM_THREADS'] = '1' -if 'OMP_NUM_THREADS' not in os.environ: - os.environ['OMP_NUM_THREADS'] = '1' -if 'VECLIB_MAXIMUM_THREADS' not in os.environ: - os.environ['VECLIB_MAXIMUM_THREADS'] = '1' +if "MKL_NUM_THREADS" not in os.environ: + os.environ["MKL_NUM_THREADS"] = "sequential" +if "NUMEXPR_NUM_THREADS" not in os.environ: + os.environ["NUMEXPR_NUM_THREADS"] = "1" +if "OMP_NUM_THREADS" not in os.environ: + os.environ["OMP_NUM_THREADS"] = "1" +if "VECLIB_MAXIMUM_THREADS" not in os.environ: + os.environ["VECLIB_MAXIMUM_THREADS"] = "1" diff --git a/deeptools/multiBamSummary.py b/deeptools/multiBamSummary.py index a588c09c9..e676e59be 100644 --- a/deeptools/multiBamSummary.py +++ b/deeptools/multiBamSummary.py @@ -11,14 +11,13 @@ from deeptools.utilities import smartLabels from deeptools._version import __version__ -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") def parse_arguments(args=None): - parser = \ - argparse.ArgumentParser( - formatter_class=argparse.RawDescriptionHelpFormatter, - description=""" + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=""" ``multiBamSummary`` computes the read coverages for genomic regions for typically two or more BAM files. The analysis can be performed for the entire genome by running the program in 'bins' mode.
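Since this description is the tool's main documentation, a short, hypothetical sketch of how the output is typically consumed may help; the "matrix" and "labels" keys come from the np.savez_compressed call in main() further down this file, and results.npz is the file name used in the parser's epilog examples:

```python
import numpy as np

# e.g. after: multiBamSummary bins --bamfiles file1.bam file2.bam -o results.npz
data = np.load("results.npz")
counts = data["matrix"]        # one row per bin, one column per BAM file
labels = list(data["labels"])  # sample labels, parallel to the columns
print(labels, counts.shape)
```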
@@ -36,155 +35,189 @@ def parse_arguments(args=None): """, - epilog='example usages:\n' - 'multiBamSummary bins --bamfiles file1.bam file2.bam -o results.npz \n\n' - 'multiBamSummary BED-file --BED selection.bed --bamfiles file1.bam file2.bam \n' - '-o results.npz' - ' \n\n', - conflict_handler='resolve') - - parser.add_argument('--version', action='version', - version='%(prog)s {}'.format(__version__)) + epilog="example usages:\n" + "multiBamSummary bins --bamfiles file1.bam file2.bam -o results.npz \n\n" + "multiBamSummary BED-file --BED selection.bed --bamfiles file1.bam file2.bam \n" + "-o results.npz" + " \n\n", + conflict_handler="resolve", + ) + + parser.add_argument( + "--version", action="version", version="%(prog)s {}".format(__version__) + ) subparsers = parser.add_subparsers( title="commands", - dest='command', - description='subcommands', - help='subcommands', - metavar='') + dest="command", + description="subcommands", + help="subcommands", + metavar="", + ) parent_parser = parserCommon.getParentArgParse(binSize=False) read_options_parser = parserCommon.read_options() # bins mode options subparsers.add_parser( - 'bins', + "bins", formatter_class=argparse.ArgumentDefaultsHelpFormatter, - parents=[bamcorrelate_args(case='bins'), - parent_parser, read_options_parser, - parserCommon.gtf_options(suppress=True) - ], + parents=[ + bamcorrelate_args(case="bins"), + parent_parser, + read_options_parser, + parserCommon.gtf_options(suppress=True), + ], help="The coverage calculation is done for consecutive bins of equal " - "size (10 kilobases by default). This mode is useful to assess the " - "genome-wide similarity of BAM files. The bin size and " - "distance between bins can be adjusted.", + "size (10 kilobases by default). This mode is useful to assess the " + "genome-wide similarity of BAM files. The bin size and " + "distance between bins can be adjusted.", add_help=False, - usage='%(prog)s ' - '--bamfiles file1.bam file2.bam ' - '-o results.npz \n') + usage="%(prog)s " "--bamfiles file1.bam file2.bam " "-o results.npz \n", + ) # BED file arguments subparsers.add_parser( - 'BED-file', + "BED-file", formatter_class=argparse.ArgumentDefaultsHelpFormatter, - parents=[bamcorrelate_args(case='BED-file'), - parent_parser, read_options_parser, - parserCommon.gtf_options() - ], + parents=[ + bamcorrelate_args(case="BED-file"), + parent_parser, + read_options_parser, + parserCommon.gtf_options(), + ], help="The user provides a BED file that contains all regions " - "that should be considered for the coverage analysis. A " - "common use is to compare ChIP-seq coverages between two " - "different samples for a set of peak regions.", - usage='%(prog)s --BED selection.bed --bamfiles file1.bam file2.bam -o results.npz\n', - add_help=False) + "that should be considered for the coverage analysis. 
A " + "common use is to compare ChIP-seq coverages between two " + "different samples for a set of peak regions.", + usage="%(prog)s --BED selection.bed --bamfiles file1.bam file2.bam -o results.npz\n", + add_help=False, + ) return parser -def bamcorrelate_args(case='bins'): +def bamcorrelate_args(case="bins"): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') + required = parser.add_argument_group("Required arguments") # define the arguments - required.add_argument('--bamfiles', '-b', - metavar='FILE1 FILE2', - help='List of indexed bam files separated by spaces.', - nargs='+', - required=True) - - required.add_argument('--outFileName', '-out', '-o', - help='File name to save the coverage matrix. This matrix ' - 'can be subsequently plotted using plotCorrelation or ' - 'or plotPCA.', - type=parserCommon.writableFile) - - optional = parser.add_argument_group('Optional arguments') - - optional.add_argument("--help", "-h", action="help", - help="show this help message and exit") - optional.add_argument('--labels', '-l', - metavar='sample1 sample2', - help='User defined labels instead of default labels from ' - 'file names. ' - 'Multiple labels have to be separated by a space, e.g. ' - '--labels sample1 sample2 sample3', - nargs='+') - optional.add_argument('--smartLabels', - action='store_true', - help='Instead of manually specifying labels for the input ' - 'BAM files, this causes deepTools to use the file name ' - 'after removing the path and extension.') - - optional.add_argument('--genomeChunkSize', - type=int, - default=None, - help='Manually specify the size of the genome provided to each processor. ' - 'The default value of None specifies that this is determined by read ' - 'density of the BAM file.') - - if case == 'bins': - optional.add_argument('--binSize', '-bs', - metavar='INT', - help='Length in bases of the window used ' - 'to sample the genome. (Default: %(default)s)', - default=10000, - type=int) - - optional.add_argument('--distanceBetweenBins', '-n', - metavar='INT', - help='By default, multiBamSummary considers consecutive ' - 'bins of the specified --binSize. However, to ' - 'reduce the computation time, a larger distance ' - 'between bins can by given. Larger distances ' - 'result in fewer bins considered. (Default: %(default)s)', - default=0, - type=int) - - required.add_argument('--BED', - help=argparse.SUPPRESS, - default=None) + required.add_argument( + "--bamfiles", + "-b", + metavar="FILE1 FILE2", + help="List of indexed bam files separated by spaces.", + nargs="+", + required=True, + ) + + required.add_argument( + "--outFileName", + "-out", + "-o", + help="File name to save the coverage matrix. This matrix " + "can be subsequently plotted using plotCorrelation or " + "or plotPCA.", + type=parserCommon.writableFile, + ) + + optional = parser.add_argument_group("Optional arguments") + + optional.add_argument( + "--help", "-h", action="help", help="show this help message and exit" + ) + optional.add_argument( + "--labels", + "-l", + metavar="sample1 sample2", + help="User defined labels instead of default labels from " + "file names. " + "Multiple labels have to be separated by a space, e.g. 
" + "--labels sample1 sample2 sample3", + nargs="+", + ) + optional.add_argument( + "--smartLabels", + action="store_true", + help="Instead of manually specifying labels for the input " + "BAM files, this causes deepTools to use the file name " + "after removing the path and extension.", + ) + + optional.add_argument( + "--genomeChunkSize", + type=int, + default=None, + help="Manually specify the size of the genome provided to each processor. " + "The default value of None specifies that this is determined by read " + "density of the BAM file.", + ) + + if case == "bins": + optional.add_argument( + "--binSize", + "-bs", + metavar="INT", + help="Length in bases of the window used " + "to sample the genome. (Default: %(default)s)", + default=10000, + type=int, + ) + + optional.add_argument( + "--distanceBetweenBins", + "-n", + metavar="INT", + help="By default, multiBamSummary considers consecutive " + "bins of the specified --binSize. However, to " + "reduce the computation time, a larger distance " + "between bins can by given. Larger distances " + "result in fewer bins considered. (Default: %(default)s)", + default=0, + type=int, + ) + + required.add_argument("--BED", help=argparse.SUPPRESS, default=None) else: - optional.add_argument('--binSize', '-bs', - help=argparse.SUPPRESS, - default=10000, - type=int) - - optional.add_argument('--distanceBetweenBins', '-n', - help=argparse.SUPPRESS, - metavar='INT', - default=0, - type=int) - - required.add_argument('--BED', - help='Limits the coverage analysis to ' - 'the regions specified in these files.', - metavar='FILE1.bed FILE2.bed', - nargs='+', - required=True) - - group = parser.add_argument_group('Output optional options') - - group.add_argument('--outRawCounts', - help='Save the counts per region to a tab-delimited file.', - type=parserCommon.writableFile, - metavar='FILE') - - group.add_argument('--scalingFactors', - help='Compute scaling factors (in the DESeq2 manner) ' - 'compatible for use with bamCoverage and write them to a ' - 'file. The file has tab-separated columns "sample" and ' - '"scalingFactor".', - type=parserCommon.writableFile, - metavar='FILE') + optional.add_argument( + "--binSize", "-bs", help=argparse.SUPPRESS, default=10000, type=int + ) + + optional.add_argument( + "--distanceBetweenBins", + "-n", + help=argparse.SUPPRESS, + metavar="INT", + default=0, + type=int, + ) + + required.add_argument( + "--BED", + help="Limits the coverage analysis to " + "the regions specified in these files.", + metavar="FILE1.bed FILE2.bed", + nargs="+", + required=True, + ) + + group = parser.add_argument_group("Output optional options") + + group.add_argument( + "--outRawCounts", + help="Save the counts per region to a tab-delimited file.", + type=parserCommon.writableFile, + metavar="FILE", + ) + + group.add_argument( + "--scalingFactors", + help="Compute scaling factors (in the DESeq2 manner) " + "compatible for use with bamCoverage and write them to a " + 'file. The file has tab-separated columns "sample" and ' + '"scalingFactor".', + type=parserCommon.writableFile, + metavar="FILE", + ) return parser @@ -214,15 +247,17 @@ def main(args=None): """ args = process_args(args) - if 'BED' in args: + if "BED" in args: bed_regions = args.BED else: bed_regions = None if len(args.bamfiles) == 1 and not (args.outRawCounts or args.scalingFactors): - sys.stderr.write("You've input a single BAM file and not specified " - "--outRawCounts or --scalingFactors. 
The resulting output will NOT be " - "useful with any deepTools program!\n") + sys.stderr.write( + "You've input a single BAM file and not specified " + "--outRawCounts or --scalingFactors. The resulting output will NOT be " + "useful with any deepTools program!\n" + ) stepsize = args.binSize + args.distanceBetweenBins c = countR.CountReadsPerBin( @@ -245,28 +280,28 @@ def main(args=None): maxFragmentLength=args.maxFragmentLength, stepSize=stepsize, zerosToNans=False, - out_file_for_raw_data=args.outRawCounts) + out_file_for_raw_data=args.outRawCounts, + ) num_reads_per_bin = c.run(allArgs=args) - sys.stderr.write("Number of bins " - "found: {}\n".format(num_reads_per_bin.shape[0])) + sys.stderr.write("Number of bins " "found: {}\n".format(num_reads_per_bin.shape[0])) if num_reads_per_bin.shape[0] < 2: - exit("ERROR: too few non zero bins found.\n" - "If using --region please check that this " - "region is covered by reads.\n") + exit( + "ERROR: too few non zero bins found.\n" + "If using --region please check that this " + "region is covered by reads.\n" + ) # numpy will append .npz to the file name if we don't do this... if args.outFileName: f = open(args.outFileName, "wb") - np.savez_compressed(f, - matrix=num_reads_per_bin, - labels=args.labels) + np.savez_compressed(f, matrix=num_reads_per_bin, labels=args.labels) f.close() if args.scalingFactors: - f = open(args.scalingFactors, 'w') + f = open(args.scalingFactors, "w") f.write("sample\tscalingFactor\n") scalingFactors = countR.estimateSizeFactors(num_reads_per_bin) for sample, scalingFactor in zip(args.labels, scalingFactors): @@ -278,7 +313,7 @@ def main(args=None): # labels header = "#'chr'\t'start'\t'end'\t" header += "'" + "'\t'".join(args.labels) + "'\n" - f = open(args.outRawCounts, 'r+') + f = open(args.outRawCounts, "r+") content = f.read() f.seek(0, 0) f.write(header + content) diff --git a/deeptools/multiBigwigSummary.py b/deeptools/multiBigwigSummary.py index 8d89421f1..12ce80dc0 100644 --- a/deeptools/multiBigwigSummary.py +++ b/deeptools/multiBigwigSummary.py @@ -12,14 +12,13 @@ import deeptools.getScorePerBigWigBin as score_bw import deeptools.deepBlue as db -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") def parse_arguments(args=None): - parser = \ - argparse.ArgumentParser( - formatter_class=argparse.RawDescriptionHelpFormatter, - description=""" + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=""" Given typically two or more bigWig files, ``multiBigwigSummary`` computes the average scores for each of the files in every genomic region. 
This analysis is performed for the entire genome by running the program in ``bins`` mode, or for certain user selected regions in ``BED-file`` @@ -35,59 +34,62 @@ def parse_arguments(args=None): """, - epilog='example usage:\n multiBigwigSummary bins ' - '-b file1.bw file2.bw -o results.npz\n\n' - 'multiBigwigSummary BED-file -b file1.bw file2.bw -o results.npz\n' - '--BED selection.bed' - ' \n\n', - conflict_handler='resolve') - - parser.add_argument('--version', action='version', - version='multiBigwigSummary {}'.format(__version__)) - subparsers = parser.add_subparsers( - title="commands", - dest='command', - metavar='') + epilog="example usage:\n multiBigwigSummary bins " + "-b file1.bw file2.bw -o results.npz\n\n" + "multiBigwigSummary BED-file -b file1.bw file2.bw -o results.npz\n" + "--BED selection.bed" + " \n\n", + conflict_handler="resolve", + ) + + parser.add_argument( + "--version", + action="version", + version="multiBigwigSummary {}".format(__version__), + ) + subparsers = parser.add_subparsers(title="commands", dest="command", metavar="") parent_parser = parserCommon.getParentArgParse(binSize=False) dbParser = parserCommon.deepBlueOptionalArgs() # bins mode options subparsers.add_parser( - 'bins', + "bins", formatter_class=argparse.ArgumentDefaultsHelpFormatter, - parents=[multiBigwigSummaryArgs(case='bins'), - parent_parser, - parserCommon.gtf_options(suppress=True), - dbParser - ], + parents=[ + multiBigwigSummaryArgs(case="bins"), + parent_parser, + parserCommon.gtf_options(suppress=True), + dbParser, + ], help="The average score is based on equally sized bins " - "(10 kilobases by default), which consecutively cover the " - "entire genome. The only exception is the last bin of a chromosome, which " - "is often smaller. The output of this mode is commonly used to assess the " - "overall similarity of different bigWig files.", + "(10 kilobases by default), which consecutively cover the " + "entire genome. The only exception is the last bin of a chromosome, which " + "is often smaller. The output of this mode is commonly used to assess the " + "overall similarity of different bigWig files.", add_help=False, - usage='multiBigwigSummary bins ' - '-b file1.bw file2.bw ' - '-o results.npz\n') + usage="multiBigwigSummary bins " "-b file1.bw file2.bw " "-o results.npz\n", + ) # BED file arguments subparsers.add_parser( - 'BED-file', + "BED-file", formatter_class=argparse.ArgumentDefaultsHelpFormatter, - parents=[multiBigwigSummaryArgs(case='BED-file'), - parent_parser, - parserCommon.gtf_options(), - dbParser - ], + parents=[ + multiBigwigSummaryArgs(case="BED-file"), + parent_parser, + parserCommon.gtf_options(), + dbParser, + ], help="The user provides a BED file that contains all regions " - "that should be considered for the analysis. A " - "common use is to compare scores (e.g. ChIP-seq scores) between " - "different samples over a set of pre-defined peak regions.", - usage='multiBigwigSummary BED-file ' - '-b file1.bw file2.bw ' - '-o results.npz --BED selection.bed\n', - add_help=False) + "that should be considered for the analysis. A " + "common use is to compare scores (e.g. 
ChIP-seq scores) between " + "different samples over a set of pre-defined peak regions.", + usage="multiBigwigSummary BED-file " + "-b file1.bw file2.bw " + "-o results.npz --BED selection.bed\n", + add_help=False, + ) return parser @@ -100,7 +102,11 @@ def process_args(args=None): elif not args.labels: args.labels = [] for f in args.bwfiles: - if f.startswith("http://") or f.startswith("https://") or f.startswith("ftp://"): + if ( + f.startswith("http://") + or f.startswith("https://") + or f.startswith("ftp://") + ): args.labels.append(f.split("/")[-1]) else: args.labels.append(os.path.basename(f)) @@ -111,92 +117,116 @@ def process_args(args=None): return args -def multiBigwigSummaryArgs(case='bins'): +def multiBigwigSummaryArgs(case="bins"): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') + required = parser.add_argument_group("Required arguments") # define the arguments - required.add_argument('--bwfiles', '-b', - metavar='FILE1 FILE2', - help='List of bigWig files, separated by spaces.', - nargs='+', - required=True) - - required.add_argument('--outFileName', '-out', '-o', - help='File name to save the compressed matrix file (npz format) ' - 'needed by the "plotPCA" and "plotCorrelation" tools.', - type=parserCommon.writableFile, - required=True) - - optional = parser.add_argument_group('Optional arguments') - - optional.add_argument("--help", "-h", action="help", - help="show this help message and exit") - optional.add_argument('--labels', '-l', - metavar='sample1 sample2', - help='User defined labels instead of default labels from ' - 'file names. ' - 'Multiple labels have to be separated by spaces, e.g., ' - '--labels sample1 sample2 sample3', - nargs='+') - optional.add_argument('--smartLabels', - action='store_true', - help='Instead of manually specifying labels for the input ' - 'bigWig files, this causes deepTools to use the file name ' - 'after removing the path and extension.') - - optional.add_argument('--chromosomesToSkip', - metavar='chr1 chr2', - help='List of chromosomes that you do not want to be included. ' - ' Useful to remove "random" or "extra" chr.', - nargs='+') - - if case == 'bins': - optional.add_argument('--binSize', '-bs', - metavar='INT', - help='Size (in bases) of the windows sampled ' - 'from the genome. (Default: %(default)s)', - default=10000, - type=int) - - optional.add_argument('--distanceBetweenBins', '-n', - metavar='INT', - help='By default, multiBigwigSummary considers adjacent ' - 'bins of the specified --binSize. However, to ' - 'reduce the computation time, a larger distance ' - 'between bins can be given. Larger distances ' - 'results in fewer considered bins. 
(Default: %(default)s)', - default=0, - type=int) - - required.add_argument('--BED', - help=argparse.SUPPRESS, - default=None) + required.add_argument( + "--bwfiles", + "-b", + metavar="FILE1 FILE2", + help="List of bigWig files, separated by spaces.", + nargs="+", + required=True, + ) + + required.add_argument( + "--outFileName", + "-out", + "-o", + help="File name to save the compressed matrix file (npz format) " + 'needed by the "plotPCA" and "plotCorrelation" tools.', + type=parserCommon.writableFile, + required=True, + ) + + optional = parser.add_argument_group("Optional arguments") + + optional.add_argument( + "--help", "-h", action="help", help="show this help message and exit" + ) + optional.add_argument( + "--labels", + "-l", + metavar="sample1 sample2", + help="User defined labels instead of default labels from " + "file names. " + "Multiple labels have to be separated by spaces, e.g., " + "--labels sample1 sample2 sample3", + nargs="+", + ) + optional.add_argument( + "--smartLabels", + action="store_true", + help="Instead of manually specifying labels for the input " + "bigWig files, this causes deepTools to use the file name " + "after removing the path and extension.", + ) + + optional.add_argument( + "--chromosomesToSkip", + metavar="chr1 chr2", + help="List of chromosomes that you do not want to be included. " + ' Useful to remove "random" or "extra" chr.', + nargs="+", + ) + + if case == "bins": + optional.add_argument( + "--binSize", + "-bs", + metavar="INT", + help="Size (in bases) of the windows sampled " + "from the genome. (Default: %(default)s)", + default=10000, + type=int, + ) + + optional.add_argument( + "--distanceBetweenBins", + "-n", + metavar="INT", + help="By default, multiBigwigSummary considers adjacent " + "bins of the specified --binSize. However, to " + "reduce the computation time, a larger distance " + "between bins can be given. Larger distances " + "result in fewer considered bins. 
(Default: %(default)s)", + default=0, + type=int, + ) + + required.add_argument("--BED", help=argparse.SUPPRESS, default=None) else: - optional.add_argument('--binSize', '-bs', - help=argparse.SUPPRESS, - default=10000, - type=int) - - optional.add_argument('--distanceBetweenBins', '-n', - help=argparse.SUPPRESS, - metavar='INT', - default=0, - type=int) - - required.add_argument('--BED', - help='Limits the analysis to ' - 'the regions specified in this file.', - metavar='file1.bed file2.bed', - nargs='+', - required=True) - - group = parser.add_argument_group('Output optional options') - - group.add_argument('--outRawCounts', - help='Save average scores per region for each bigWig file to a single tab-delimited file.', - type=parserCommon.writableFile, - metavar='FILE') + optional.add_argument( + "--binSize", "-bs", help=argparse.SUPPRESS, default=10000, type=int + ) + + optional.add_argument( + "--distanceBetweenBins", + "-n", + help=argparse.SUPPRESS, + metavar="INT", + default=0, + type=int, + ) + + required.add_argument( + "--BED", + help="Limits the analysis to " "the regions specified in this file.", + metavar="file1.bed file2.bed", + nargs="+", + required=True, + ) + + group = parser.add_argument_group("Output optional options") + + group.add_argument( + "--outRawCounts", + help="Save average scores per region for each bigWig file to a single tab-delimited file.", + type=parserCommon.writableFile, + metavar="FILE", + ) return parser @@ -211,15 +241,17 @@ def main(args=None): """ args = process_args(args) - if 'BED' in args: + if "BED" in args: bed_regions = args.BED else: bed_regions = None if len(args.bwfiles) == 1 and not args.outRawCounts: - sys.stderr.write("You've input a single bigWig file and not specified " - "--outRawCounts. The resulting output will NOT be " - "useful with any deepTools program!\n") + sys.stderr.write( + "You've input a single bigWig file and not specified " + "--outRawCounts. 
The resulting output will NOT be " + "useful with any deepTools program!\n" + ) # Preload deepBlue files, which need to then be deleted deepBlueFiles = [] @@ -227,11 +259,17 @@ def main(args=None): if db.isDeepBlue(fname): deepBlueFiles.append([fname, idx]) if len(deepBlueFiles) > 0: - sys.stderr.write("Preloading the following deepBlue files: {}\n".format(",".join([x[0] for x in deepBlueFiles]))) - if 'BED' in args: + sys.stderr.write( + "Preloading the following deepBlue files: {}\n".format( + ",".join([x[0] for x in deepBlueFiles]) + ) + ) + if "BED" in args: regs = db.makeRegions(args.BED, args) else: - foo = db.deepBlue(deepBlueFiles[0][0], url=args.deepBlueURL, userKey=args.userKey) + foo = db.deepBlue( + deepBlueFiles[0][0], url=args.deepBlueURL, userKey=args.userKey + ) regs = db.makeTiles(foo, args) del foo for x in deepBlueFiles: @@ -243,7 +281,7 @@ def main(args=None): res = list(map(db.preloadWrapper, deepBlueFiles)) # substitute the file names with the temp files - for (ftuple, r) in zip(deepBlueFiles, res): + for ftuple, r in zip(deepBlueFiles, res): args.bwfiles[ftuple[1]] = r deepBlueFiles = [[x[0], x[1]] for x in deepBlueFiles] del regs @@ -259,20 +297,20 @@ def main(args=None): bedFile=bed_regions, chrsToSkip=args.chromosomesToSkip, out_file_for_raw_data=args.outRawCounts, - allArgs=args) + allArgs=args, + ) - sys.stderr.write("Number of bins " - "found: {}\n".format(num_reads_per_bin.shape[0])) + sys.stderr.write("Number of bins " "found: {}\n".format(num_reads_per_bin.shape[0])) if num_reads_per_bin.shape[0] < 2: - exit("ERROR: too few non zero bins found.\n" - "If using --region please check that this " - "region is covered by reads.\n") + exit( + "ERROR: too few non zero bins found.\n" + "If using --region please check that this " + "region is covered by reads.\n" + ) f = open(args.outFileName, "wb") - np.savez_compressed(f, - matrix=num_reads_per_bin, - labels=args.labels) + np.savez_compressed(f, matrix=num_reads_per_bin, labels=args.labels) f.close() if args.outRawCounts: diff --git a/deeptools/parserCommon.py b/deeptools/parserCommon.py index ef4f4d074..d170903af 100755 --- a/deeptools/parserCommon.py +++ b/deeptools/parserCommon.py @@ -6,7 +6,10 @@ def check_float_0_1(value): v = float(value) if v < 0.0 or v > 1.0: - raise argparse.ArgumentTypeError("%s is an invalid floating point value. It must be between 0.0 and 1.0" % value) + raise argparse.ArgumentTypeError( + "%s is an invalid floating point value. It must be between 0.0 and 1.0" + % value + ) return v @@ -16,24 +19,33 @@ def check_list_of_comma_values(value): for foo in value: foo = value.split(",") if len(foo) < 2: - raise argparse.ArgumentTypeError("%s is an invalid element of a list of comma separated values. " - "Only argument elements of the following form are accepted: 'foo,bar'" % foo) + raise argparse.ArgumentTypeError( + "%s is an invalid element of a list of comma separated values. " + "Only argument elements of the following form are accepted: 'foo,bar'" + % foo + ) return value def output(args=None): parser = argparse.ArgumentParser(add_help=False) - group = parser.add_argument_group('Output') - group.add_argument('--outFileName', '-o', - help='Output file name.', - metavar='FILENAME', - type=writableFile, - required=True) - - group.add_argument('--outFileFormat', '-of', - help='Output file type. 
Either "bigwig" or "bedgraph".', - choices=['bigwig', 'bedgraph'], - default='bigwig') + group = parser.add_argument_group("Output") + group.add_argument( + "--outFileName", + "-o", + help="Output file name.", + metavar="FILENAME", + type=writableFile, + required=True, + ) + + group.add_argument( + "--outFileFormat", + "-of", + help='Output file type. Either "bigwig" or "bedgraph".', + choices=["bigwig", "bedgraph"], + default="bigwig", + ) return parser @@ -43,96 +55,111 @@ def read_options(): of the read coverage """ parser = argparse.ArgumentParser(add_help=False) - group = parser.add_argument_group('Read processing options') - - group.add_argument('--extendReads', '-e', - help='This parameter allows the extension of reads to ' - 'fragment size. If set, each read is ' - 'extended, without exception.\n' - '*NOTE*: This feature is generally NOT recommended for ' - 'spliced-read data, such as RNA-seq, as it would ' - 'extend reads over skipped regions.\n' - '*Single-end*: Requires a user specified value for the ' - 'final fragment length. Reads that already exceed this ' - 'fragment length will not be extended.\n' - '*Paired-end*: Reads with mates are always extended to ' - 'match the fragment size defined by the two read mates. ' - 'Unmated reads, mate reads that map too far apart ' - '(>4x fragment length) or even map to different ' - 'chromosomes are treated like single-end reads. The input ' - 'of a fragment length value is optional. If ' - 'no value is specified, it is estimated from the ' - 'data (mean of the fragment size of all mate reads).\n', - type=int, - nargs='?', - const=True, - default=False, - metavar="INT bp") - - group.add_argument('--ignoreDuplicates', - help='If set, reads that have the same orientation ' - 'and start position will be considered only ' - 'once. If reads are paired, the mate\'s position ' - 'also has to coincide to ignore a read.', - action='store_true' - ) - - group.add_argument('--minMappingQuality', - metavar='INT', - help='If set, only reads that have a mapping ' - 'quality score of at least this are ' - 'considered.', - type=int, - ) - - group.add_argument('--centerReads', - help='By adding this option, reads are centered with ' - 'respect to the fragment length. For paired-end data, ' - 'the read is centered at the fragment length defined ' - 'by the two ends of the fragment. For single-end data, the ' - 'given fragment length is used. This option is ' - 'useful to get a sharper signal around enriched ' - 'regions.', - action='store_true') - - group.add_argument('--samFlagInclude', - help='Include reads based on the SAM flag. For example, ' - 'to get only reads that are the first mate, use a flag of 64. ' - 'This is useful to count properly paired reads only once, ' - 'as otherwise the second mate will be also considered for the ' - 'coverage. (Default: %(default)s)', - metavar='INT', - default=None, - type=int, - required=False) - - group.add_argument('--samFlagExclude', - help='Exclude reads based on the SAM flag. For example, ' - 'to get only reads that map to the forward strand, use ' - '--samFlagExclude 16, where 16 is the SAM flag for reads ' - 'that map to the reverse strand. (Default: %(default)s)', - metavar='INT', - default=None, - type=int, - required=False) - - group.add_argument('--minFragmentLength', - help='The minimum fragment length needed for read/pair ' - 'inclusion. This option is primarily useful ' - 'in ATACseq experiments, for filtering mono- or ' - 'di-nucleosome fragments. 
(Default: %(default)s)', - metavar='INT', - default=0, - type=int, - required=False) - - group.add_argument('--maxFragmentLength', - help='The maximum fragment length needed for read/pair ' - 'inclusion. (Default: %(default)s)', - metavar='INT', - default=0, - type=int, - required=False) + group = parser.add_argument_group("Read processing options") + + group.add_argument( + "--extendReads", + "-e", + help="This parameter allows the extension of reads to " + "fragment size. If set, each read is " + "extended, without exception.\n" + "*NOTE*: This feature is generally NOT recommended for " + "spliced-read data, such as RNA-seq, as it would " + "extend reads over skipped regions.\n" + "*Single-end*: Requires a user specified value for the " + "final fragment length. Reads that already exceed this " + "fragment length will not be extended.\n" + "*Paired-end*: Reads with mates are always extended to " + "match the fragment size defined by the two read mates. " + "Unmated reads, mate reads that map too far apart " + "(>4x fragment length) or even map to different " + "chromosomes are treated like single-end reads. The input " + "of a fragment length value is optional. If " + "no value is specified, it is estimated from the " + "data (mean of the fragment size of all mate reads).\n", + type=int, + nargs="?", + const=True, + default=False, + metavar="INT bp", + ) + + group.add_argument( + "--ignoreDuplicates", + help="If set, reads that have the same orientation " + "and start position will be considered only " + "once. If reads are paired, the mate's position " + "also has to coincide to ignore a read.", + action="store_true", + ) + + group.add_argument( + "--minMappingQuality", + metavar="INT", + help="If set, only reads that have a mapping " + "quality score of at least this are " + "considered.", + type=int, + ) + + group.add_argument( + "--centerReads", + help="By adding this option, reads are centered with " + "respect to the fragment length. For paired-end data, " + "the read is centered at the fragment length defined " + "by the two ends of the fragment. For single-end data, the " + "given fragment length is used. This option is " + "useful to get a sharper signal around enriched " + "regions.", + action="store_true", + ) + + group.add_argument( + "--samFlagInclude", + help="Include reads based on the SAM flag. For example, " + "to get only reads that are the first mate, use a flag of 64. " + "This is useful to count properly paired reads only once, " + "as otherwise the second mate will be also considered for the " + "coverage. (Default: %(default)s)", + metavar="INT", + default=None, + type=int, + required=False, + ) + + group.add_argument( + "--samFlagExclude", + help="Exclude reads based on the SAM flag. For example, " + "to get only reads that map to the forward strand, use " + "--samFlagExclude 16, where 16 is the SAM flag for reads " + "that map to the reverse strand. (Default: %(default)s)", + metavar="INT", + default=None, + type=int, + required=False, + ) + + group.add_argument( + "--minFragmentLength", + help="The minimum fragment length needed for read/pair " + "inclusion. This option is primarily useful " + "in ATACseq experiments, for filtering mono- or " + "di-nucleosome fragments. (Default: %(default)s)", + metavar="INT", + default=0, + type=int, + required=False, + ) + + group.add_argument( + "--maxFragmentLength", + help="The maximum fragment length needed for read/pair " + "inclusion. 
(Default: %(default)s)", + metavar="INT", + default=0, + type=int, + required=False, + ) return parser @@ -146,144 +173,148 @@ def gtf_options(suppress=False): group = parser else: parser = argparse.ArgumentParser(add_help=False) - group = parser.add_argument_group('GTF/BED12 options') + group = parser.add_argument_group("GTF/BED12 options") if suppress: help = argparse.SUPPRESS else: - help = 'When either a BED12 or GTF file are used to provide \ + help = "When either a BED12 or GTF file are used to provide \ regions, perform the computation on the merged exons, \ rather than using the genomic interval defined by the \ 5-prime and 3-prime most transcript bound (i.e., columns \ 2 and 3 of a BED file). If a BED3 or BED6 file is used \ - as input, then columns 2 and 3 are used as an exon. (Default: %(default)s)' + as input, then columns 2 and 3 are used as an exon. (Default: %(default)s)" - group.add_argument('--metagene', - help=help, - action='store_true', - dest='keepExons') + group.add_argument("--metagene", help=help, action="store_true", dest="keepExons") if suppress is False: - help = 'When a GTF file is used to provide regions, only \ + help = "When a GTF file is used to provide regions, only \ entries with this value as their feature (column 3) \ - will be processed as transcripts. (Default: %(default)s)' + will be processed as transcripts. (Default: %(default)s)" - group.add_argument('--transcriptID', - help=help, - default='transcript') + group.add_argument("--transcriptID", help=help, default="transcript") if suppress is False: - help = 'When a GTF file is used to provide regions, only \ + help = "When a GTF file is used to provide regions, only \ entries with this value as their feature (column 3) \ will be processed as exons. CDS would be another common \ - value for this. (Default: %(default)s)' + value for this. (Default: %(default)s)" - group.add_argument('--exonID', - help=help, - default='exon') + group.add_argument("--exonID", help=help, default="exon") if suppress is False: - help = 'Each region has an ID (e.g., ACTB) assigned to it, \ + help = "Each region has an ID (e.g., ACTB) assigned to it, \ which for BED files is either column 4 (if it exists) \ or the interval bounds. For GTF files this is instead \ stored in the last column as a key:value pair (e.g., as \ - \'transcript_id "ACTB"\', for a key of transcript_id \ + 'transcript_id \"ACTB\"', for a key of transcript_id \ and a value of ACTB). In some cases it can be \ convenient to use a different identifier. To do so, set \ - this to the desired key. (Default: %(default)s)' + this to the desired key. (Default: %(default)s)" - group.add_argument('--transcript_id_designator', - help=help, - default='transcript_id') + group.add_argument("--transcript_id_designator", help=help, default="transcript_id") return parser def normalization_options(): - """Common arguments related to read coverage normalization - """ + """Common arguments related to read coverage normalization""" parser = argparse.ArgumentParser(add_help=False) - group = parser.add_argument_group('Read coverage normalization options') - - group.add_argument('--effectiveGenomeSize', - help='The effective genome size is the portion ' - 'of the genome that is mappable. Large fractions of ' - 'the genome are stretches of NNNN that should be ' - 'discarded. Also, if repetitive regions were not ' - 'included in the mapping of reads, the effective ' - 'genome size needs to be adjusted accordingly. 
' - 'A table of values is available here: ' - 'http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html .', - default=None, - type=int, - required=False) - - group.add_argument('--normalizeUsing', - help='Use one of the entered methods to ' - 'normalize the number of reads per bin. By default, no normalization is performed. ' - 'RPKM = Reads Per Kilobase per Million mapped reads; ' - 'CPM = Counts Per Million mapped reads, same as CPM in RNA-seq; ' - 'BPM = Bins Per Million mapped reads, same as TPM in RNA-seq; ' - 'RPGC = reads per genomic content (1x normalization); ' - 'Mapped reads are considered after blacklist filtering (if applied). ' - 'RPKM (per bin) = number of reads per bin / ' - '(number of mapped reads (in millions) * bin length (kb)). ' - 'CPM (per bin) = number of reads per bin / ' - 'number of mapped reads (in millions). ' - 'BPM (per bin) = number of reads per bin / ' - 'sum of all reads per bin (in millions). ' - 'RPGC (per bin) = number of reads per bin / ' - 'scaling factor for 1x average coverage. ' - 'None = the default and equivalent to not setting this option at all. ' - 'This scaling factor, in turn, is determined from the ' - 'sequencing depth: (total number of mapped reads * fragment length) / ' - 'effective genome size.\nThe scaling factor used ' - 'is the inverse of the sequencing depth computed ' - 'for the sample to match the 1x coverage. This option requires --effectiveGenomeSize. ' - 'Each read is considered independently, ' - 'if you want to only count one mate from a pair in ' - 'paired-end data, then use the --samFlagInclude/--samFlagExclude options. (Default: %(default)s)', - choices=['RPKM', 'CPM', 'BPM', 'RPGC', 'None'], - default=None, - required=False) - - group.add_argument('--exactScaling', - help='Instead of computing scaling factors based on a sampling of the reads, ' - 'process all of the reads to determine the exact number that will be used in ' - 'the output. This requires significantly more time to compute, but will ' - 'produce more accurate scaling factors in cases where alignments that are ' - 'being filtered are rare and lumped together. In other words, this is only ' - 'needed when region-based sampling is expected to produce incorrect results.', - action='store_true') - - group.add_argument('--ignoreForNormalization', '-ignore', - help='A list of space-delimited chromosome names ' - 'containing those chromosomes that should be excluded ' - 'for computing the normalization. This is useful when considering ' - 'samples with unequal coverage across chromosomes, like male ' - 'samples. An usage examples is --ignoreForNormalization chrX chrM.', - nargs='+') - - group.add_argument('--skipNonCoveredRegions', '--skipNAs', - help='This parameter determines if non-covered regions ' - '(regions without overlapping reads) in a BAM file should ' - 'be skipped. The default is to treat those regions as having a value of zero. ' - 'The decision to skip non-covered regions ' - 'depends on the interpretation of the data. Non-covered regions ' - 'may represent, for example, repetitive regions that should be skipped.', - action='store_true') - - group.add_argument('--smoothLength', - metavar="INT bp", - help='The smooth length defines a window, larger than ' - 'the binSize, to average the number of reads. For ' - 'example, if the --binSize is set to 20 and the ' - '--smoothLength is set to 60, then, for each ' - 'bin, the average of the bin and its left and right ' - 'neighbors is considered. 
Any value smaller than ' - '--binSize will be ignored and no smoothing will be ' - 'applied.', - type=int) + group = parser.add_argument_group("Read coverage normalization options") + + group.add_argument( + "--effectiveGenomeSize", + help="The effective genome size is the portion " + "of the genome that is mappable. Large fractions of " + "the genome are stretches of NNNN that should be " + "discarded. Also, if repetitive regions were not " + "included in the mapping of reads, the effective " + "genome size needs to be adjusted accordingly. " + "A table of values is available here: " + "http://deeptools.readthedocs.io/en/latest/content/feature/effectiveGenomeSize.html .", + default=None, + type=int, + required=False, + ) + + group.add_argument( + "--normalizeUsing", + help="Use one of the entered methods to " + "normalize the number of reads per bin. By default, no normalization is performed. " + "RPKM = Reads Per Kilobase per Million mapped reads; " + "CPM = Counts Per Million mapped reads, same as CPM in RNA-seq; " + "BPM = Bins Per Million mapped reads, same as TPM in RNA-seq; " + "RPGC = reads per genomic content (1x normalization); " + "Mapped reads are considered after blacklist filtering (if applied). " + "RPKM (per bin) = number of reads per bin / " + "(number of mapped reads (in millions) * bin length (kb)). " + "CPM (per bin) = number of reads per bin / " + "number of mapped reads (in millions). " + "BPM (per bin) = number of reads per bin / " + "sum of all reads per bin (in millions). " + "RPGC (per bin) = number of reads per bin / " + "scaling factor for 1x average coverage. " + "None = the default and equivalent to not setting this option at all. " + "This scaling factor, in turn, is determined from the " + "sequencing depth: (total number of mapped reads * fragment length) / " + "effective genome size.\nThe scaling factor used " + "is the inverse of the sequencing depth computed " + "for the sample to match the 1x coverage. This option requires --effectiveGenomeSize. " + "Each read is considered independently, " + "if you want to only count one mate from a pair in " + "paired-end data, then use the --samFlagInclude/--samFlagExclude options. (Default: %(default)s)", + choices=["RPKM", "CPM", "BPM", "RPGC", "None"], + default=None, + required=False, + ) + + group.add_argument( + "--exactScaling", + help="Instead of computing scaling factors based on a sampling of the reads, " + "process all of the reads to determine the exact number that will be used in " + "the output. This requires significantly more time to compute, but will " + "produce more accurate scaling factors in cases where alignments that are " + "being filtered are rare and lumped together. In other words, this is only " + "needed when region-based sampling is expected to produce incorrect results.", + action="store_true", + ) + + group.add_argument( + "--ignoreForNormalization", + "-ignore", + help="A list of space-delimited chromosome names " + "containing those chromosomes that should be excluded " + "for computing the normalization. This is useful when considering " + "samples with unequal coverage across chromosomes, like male " + "samples. A usage example is --ignoreForNormalization chrX chrM.", + nargs="+", + ) + + group.add_argument( + "--skipNonCoveredRegions", + "--skipNAs", + help="This parameter determines if non-covered regions " + "(regions without overlapping reads) in a BAM file should " + "be skipped. The default is to treat those regions as having a value of zero. 
" + "The decision to skip non-covered regions " + "depends on the interpretation of the data. Non-covered regions " + "may represent, for example, repetitive regions that should be skipped.", + action="store_true", + ) + + group.add_argument( + "--smoothLength", + metavar="INT bp", + help="The smooth length defines a window, larger than " + "the binSize, to average the number of reads. For " + "example, if the --binSize is set to 20 and the " + "--smoothLength is set to 60, then, for each " + "bin, the average of the bin and its left and right " + "neighbors is considered. Any value smaller than " + "--binSize will be ignored and no smoothing will be " + "applied.", + type=int, + ) return parser @@ -294,54 +325,68 @@ def getParentArgParse(args=None, binSize=True, blackList=True): """ parser = argparse.ArgumentParser(add_help=False) - optional = parser.add_argument_group('Optional arguments') + optional = parser.add_argument_group("Optional arguments") - optional.add_argument('--version', action='version', - version='%(prog)s {}'.format(__version__)) + optional.add_argument( + "--version", action="version", version="%(prog)s {}".format(__version__) + ) if binSize: - optional.add_argument('--binSize', '-bs', - help='Size of the bins, in bases, for the output ' - 'of the bigwig/bedgraph file. (Default: %(default)s)', - metavar="INT bp", - type=int, - default=50) - - optional.add_argument('--region', '-r', - help='Region of the genome to limit the operation ' - 'to - this is useful when testing parameters to ' - 'reduce the computing time. The format is ' - 'chr:start:end, for example --region chr10 or ' - '--region chr10:456700:891000.', - metavar="CHR:START:END", - required=False, - type=genomicRegion) + optional.add_argument( + "--binSize", + "-bs", + help="Size of the bins, in bases, for the output " + "of the bigwig/bedgraph file. (Default: %(default)s)", + metavar="INT bp", + type=int, + default=50, + ) + + optional.add_argument( + "--region", + "-r", + help="Region of the genome to limit the operation " + "to - this is useful when testing parameters to " + "reduce the computing time. The format is " + "chr:start:end, for example --region chr10 or " + "--region chr10:456700:891000.", + metavar="CHR:START:END", + required=False, + type=genomicRegion, + ) if blackList: - optional.add_argument('--blackListFileName', '-bl', - help="A BED or GTF file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant.", - metavar="BED file", - nargs="+", - required=False) - - optional.add_argument('--numberOfProcessors', '-p', - help='Number of processors to use. Type "max/2" to ' - 'use half the maximum number of processors or "max" ' - 'to use all available processors. (Default: %(default)s)', - metavar="INT", - type=numberOfProcessors, - default=1, - required=False) - - optional.add_argument('--verbose', '-v', - help='Set to see processing messages.', - action='store_true') + optional.add_argument( + "--blackListFileName", + "-bl", + help="A BED or GTF file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. 
Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered. Please note that you should adjust the effective genome size, if relevant.", + metavar="BED file", + nargs="+", + required=False, + ) + + optional.add_argument( + "--numberOfProcessors", + "-p", + help='Number of processors to use. Type "max/2" to ' + 'use half the maximum number of processors or "max" ' + "to use all available processors. (Default: %(default)s)", + metavar="INT", + type=numberOfProcessors, + default=1, + required=False, + ) + + optional.add_argument( + "--verbose", "-v", help="Set to see processing messages.", action="store_true" + ) return parser def numberOfProcessors(string): import multiprocessing + availProc = multiprocessing.cpu_count() if string == "max/2": # default case @@ -355,13 +400,16 @@ def numberOfProcessors(string): numberOfProcessors = int(string) except ValueError: raise argparse.ArgumentTypeError( - "{} is not a valid number of processors".format(string)) + "{} is not a valid number of processors".format(string) + ) except Exception as e: - raise argparse.ArgumentTypeError("the given value {} is not valid. " - "Error message: {}\nThe number of " - "available processors in your " - "computer is {}.".format(string, e, availProc)) + raise argparse.ArgumentTypeError( + "the given value {} is not valid. " + "Error message: {}\nThe number of " + "available processors in your " + "computer is {}.".format(string, e, availProc) + ) if numberOfProcessors > availProc: numberOfProcessors = availProc @@ -371,8 +419,8 @@ def numberOfProcessors(string): def genomicRegion(string): # remove whitespaces using split,join trick - region = ''.join(string.split()) - if region == '': + region = "".join(string.split()) + if region == "": return None # remove undesired characters that may be present and # replace - by : @@ -382,8 +430,7 @@ def genomicRegion(string): except: region = region.translate({ord(i): None for i in ",;|!{}()"}) if len(region) == 0: - raise argparse.ArgumentTypeError( - "{} is not a valid region".format(string)) + raise argparse.ArgumentTypeError("{} is not a valid region".format(string)) return region @@ -392,7 +439,7 @@ def writableFile(string): Simple function that tests if a given path is writable """ try: - open(string, 'w').close() + open(string, "w").close() os.remove(string) except: msg = "{} file can't be opened for writing".format(string) @@ -407,494 +454,584 @@ def writableFile(string): def heatmapperMatrixArgs(args=None): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') - required.add_argument('--matrixFile', '-m', - help='Matrix file from the computeMatrix tool.', - type=argparse.FileType('r'), - ) - - required.add_argument('--outFileName', '-out', '-o', - help='File name to save the image to. The file ' - 'ending will be used to determine the image ' - 'format. The available options are: "png", ' - '"eps", "pdf" and "svg", e.g., MyHeatmap.png.', - type=writableFile, - required=True) + required = parser.add_argument_group("Required arguments") + required.add_argument( + "--matrixFile", + "-m", + help="Matrix file from the computeMatrix tool.", + type=argparse.FileType("r"), + ) + + required.add_argument( + "--outFileName", + "-out", + "-o", + help="File name to save the image to. The file " + "ending will be used to determine the image " + 'format. 
The available options are: "png", ' + '"eps", "pdf" and "svg", e.g., MyHeatmap.png.', + type=writableFile, + required=True, + ) return parser -def heatmapperOutputArgs(args=None, - mode=['heatmap', 'profile'][0]): +def heatmapperOutputArgs(args=None, mode=["heatmap", "profile"][0]): parser = argparse.ArgumentParser(add_help=False) - output = parser.add_argument_group('Output options') + output = parser.add_argument_group("Output options") output.add_argument( - '--outFileSortedRegions', - help='File name into which the regions are saved ' - 'after skipping zeros or min/max threshold values. The ' - 'order of the regions in the file follows the sorting ' - 'order selected. This is useful, for example, to ' - 'generate other heatmaps while keeping the sorting of the ' - 'first heatmap. Example: Heatmap1sortedRegions.bed', - metavar='FILE', - type=argparse.FileType('w')) - - if mode == 'heatmap': - output.add_argument('--outFileNameMatrix', - help='If this option is given, then the matrix ' - 'of values underlying the heatmap will be saved ' - 'using this name, e.g. MyMatrix.gz.', - metavar='FILE', - type=writableFile) - - output.add_argument('--interpolationMethod', - help='If the heatmap image contains a large number of columns ' - 'is usually better to use an interpolation method to produce ' - 'better results (see ' - 'https://matplotlib.org/examples/images_contours_and_fields/interpolation_methods.html). ' - 'Be default, plotHeatmap uses the method `nearest` if the number of columns is 1000 or ' - 'less. Otherwise it uses the bilinear method. This default behaviour can be changed by ' - 'using any of the following options: "nearest", "bilinear", "bicubic", ' - '"gaussian"', - choices=['auto', 'nearest', 'bilinear', 'bicubic', 'gaussian'], - metavar='STR', - default='auto') - elif mode == 'profile': - output.add_argument('--outFileNameData', - help='File name to save the data ' - 'underlying data for the average profile, e.g. ' - 'myProfile.tab.', - type=writableFile) + "--outFileSortedRegions", + help="File name into which the regions are saved " + "after skipping zeros or min/max threshold values. The " + "order of the regions in the file follows the sorting " + "order selected. This is useful, for example, to " + "generate other heatmaps while keeping the sorting of the " + "first heatmap. Example: Heatmap1sortedRegions.bed", + metavar="FILE", + type=argparse.FileType("w"), + ) + + if mode == "heatmap": + output.add_argument( + "--outFileNameMatrix", + help="If this option is given, then the matrix " + "of values underlying the heatmap will be saved " + "using this name, e.g. MyMatrix.gz.", + metavar="FILE", + type=writableFile, + ) + + output.add_argument( + "--interpolationMethod", + help="If the heatmap image contains a large number of columns " + "it is usually better to use an interpolation method to produce " + "better results (see " + "https://matplotlib.org/examples/images_contours_and_fields/interpolation_methods.html). " + "By default, plotHeatmap uses the method `nearest` if the number of columns is 1000 or " + "less. Otherwise it uses the bilinear method. This default behaviour can be changed by " + 'using any of the following options: "nearest", "bilinear", "bicubic", ' + '"gaussian"', + choices=["auto", "nearest", "bilinear", "bicubic", "gaussian"], + metavar="STR", + default="auto", + ) + elif mode == "profile": + output.add_argument( + "--outFileNameData", + help="File name to save the " + "underlying data for the average profile, e.g. 
" + "myProfile.tab.", + type=writableFile, + ) output.add_argument( - '--dpi', - help='Set the DPI to save the figure.', - type=int, - default=200) + "--dpi", help="Set the DPI to save the figure.", type=int, default=200 + ) return parser -def heatmapperOptionalArgs(mode=['heatmap', 'profile'][0]): - +def heatmapperOptionalArgs(mode=["heatmap", "profile"][0]): parser = argparse.ArgumentParser(add_help=False) - cluster = parser.add_argument_group('Clustering arguments') + cluster = parser.add_argument_group("Clustering arguments") cluster.add_argument( - '--kmeans', - help='Number of clusters to compute. When this ' - 'option is set, the matrix is split into clusters ' - 'using the k-means algorithm. Only works for data that ' - 'is not grouped, otherwise only the first group will ' - 'be clustered. If more specific clustering methods ' - 'are required, then save the underlying matrix ' - 'and run the clustering using other software. The plotting ' - 'of the clustering may fail with an error if a ' - 'cluster has very few members compared to the total number ' - 'or regions.', - type=int) + "--kmeans", + help="Number of clusters to compute. When this " + "option is set, the matrix is split into clusters " + "using the k-means algorithm. Only works for data that " + "is not grouped, otherwise only the first group will " + "be clustered. If more specific clustering methods " + "are required, then save the underlying matrix " + "and run the clustering using other software. The plotting " + "of the clustering may fail with an error if a " + "cluster has very few members compared to the total number " + "or regions.", + type=int, + ) cluster.add_argument( - '--hclust', - help='Number of clusters to compute. When this ' - 'option is set, then the matrix is split into clusters ' + "--hclust", + help="Number of clusters to compute. When this " + "option is set, then the matrix is split into clusters " 'using the hierarchical clustering algorithm, using "ward linkage". ' - 'Only works for data that is not grouped, otherwise only the first ' - 'group will be clustered. --hclust could be very slow if you have ' - '>1000 regions. In those cases, you might prefer --kmeans or if more ' - 'clustering methods are required you can save the underlying matrix and run ' - 'the clustering using other software. The plotting of the clustering may ' - 'fail with an error if a cluster has very few members compared to the ' - 'total number of regions.', - type=int) + "Only works for data that is not grouped, otherwise only the first " + "group will be clustered. --hclust could be very slow if you have " + ">1000 regions. In those cases, you might prefer --kmeans or if more " + "clustering methods are required you can save the underlying matrix and run " + "the clustering using other software. The plotting of the clustering may " + "fail with an error if a cluster has very few members compared to the " + "total number of regions.", + type=int, + ) cluster.add_argument( - '--silhouette', - help='Compute the silhouette score for regions. This is only' - ' applicable if clustering has been performed. The silhouette score' - ' is a measure of how similar a region is to other regions in the' - ' same cluster as opposed to those in other clusters. It will be reported' - ' in the final column of the BED file with regions. 
The ' - 'silhouette evaluation can be very slow when you have more' - 'than 100 000 regions.', - action='store_true' - ) - - optional = parser.add_argument_group('Optional arguments') - - optional.add_argument("--help", "-h", action="help", - help="show this help message and exit") - optional.add_argument('--version', action='version', - version='%(prog)s {}'.format(__version__)) - if mode == 'profile': + "--silhouette", + help="Compute the silhouette score for regions. This is only" + " applicable if clustering has been performed. The silhouette score" + " is a measure of how similar a region is to other regions in the" + " same cluster as opposed to those in other clusters. It will be reported" + " in the final column of the BED file with regions. The " + "silhouette evaluation can be very slow when you have more " + "than 100 000 regions.", + action="store_true", + ) + + optional = parser.add_argument_group("Optional arguments") + + optional.add_argument( + "--help", "-h", action="help", help="show this help message and exit" + ) + optional.add_argument( + "--version", action="version", version="%(prog)s {}".format(__version__) + ) + if mode == "profile": optional.add_argument( - '--averageType', - default='mean', + "--averageType", + default="mean", choices=["mean", "median", "min", "max", "std", "sum"], - help='The type of statistic that should be used for the ' + help="The type of statistic that should be used for the " 'profile. The options are: "mean", "median", "min", "max", ' - '"sum" and "std".') + '"sum" and "std".', + ) - optional.add_argument('--plotHeight', - help='Plot height in cm.', - type=float, - default=7) + optional.add_argument( + "--plotHeight", help="Plot height in cm.", type=float, default=7 + ) - optional.add_argument('--plotWidth', - help='Plot width in cm. The minimum value is 1 cm.', - type=float, - default=11) + optional.add_argument( + "--plotWidth", + help="Plot width in cm. The minimum value is 1 cm.", + type=float, + default=11, + ) optional.add_argument( - '--plotType', + "--plotType", help='"lines" will plot the profile line based ' 'on the average type selected. "fill" ' - 'fills the region between zero and the profile ' - 'curve. The fill in color is semi transparent to ' + "fills the region between zero and the profile " + "curve. The fill in color is semi-transparent to " 'distinguish different profiles. "se" and "std" ' - 'color the region between the profile and the ' - 'standard error or standard deviation of the data. ' - 'As in the case of ' - 'fill, a semi-transparent color is used. ' + "color the region between the profile and the " + "standard error or standard deviation of the data. " + "As in the case of " + "fill, a semi-transparent color is used. " '"overlapped_lines" plots each region\'s value, one on ' 'top of the other. "heatmap" plots a ' - 'summary heatmap.', - choices=['lines', 'fill', 'se', 'std', 'overlapped_lines', 'heatmap'], - default='lines') - - optional.add_argument('--colors', - help='List of colors to use ' - 'for the plotted lines (N.B., not applicable to \'--plotType overlapped_lines\'). Color names ' - 'and html hex strings (e.g., #eeff22) ' - 'are accepted. The color names should ' - 'be space separated. 
For example, ' - '--colors red blue green ', - nargs='+') - - optional.add_argument('--numPlotsPerRow', - help='Number of plots per row', - type=int, - default=8) - - optional.add_argument('--clusterUsingSamples', - help='List of sample numbers (order as in ' - 'matrix), that are used for clustering by ' - '--kmeans or --hclust if not given, all samples ' - 'are taken into account for clustering. ' - 'Example: --ClusterUsingSamples 1 3', - type=int, nargs='+') - - elif mode == 'heatmap': + "summary heatmap.", + choices=["lines", "fill", "se", "std", "overlapped_lines", "heatmap"], + default="lines", + ) + + optional.add_argument( + "--colors", + help="List of colors to use " + "for the plotted lines (N.B., not applicable to '--plotType overlapped_lines'). Color names " + "and html hex strings (e.g., #eeff22) " + "are accepted. The color names should " + "be space separated. For example, " + "--colors red blue green ", + nargs="+", + ) + optional.add_argument( - '--plotType', + "--numPlotsPerRow", help="Number of plots per row", type=int, default=8 + ) + + optional.add_argument( + "--clusterUsingSamples", + help="List of sample numbers (order as in " + "matrix), that are used for clustering by " + "--kmeans or --hclust. If not given, all samples " + "are taken into account for clustering. " + "Example: --clusterUsingSamples 1 3", + type=int, + nargs="+", + ) + + elif mode == "heatmap": + optional.add_argument( + "--plotType", help='"lines" will plot the profile line based ' 'on the average type selected. "fill" ' - 'fills the region between zero and the profile ' - 'curve. The fill in color is semi transparent to ' + "fills the region between zero and the profile " + "curve. The fill in color is semi-transparent to " 'distinguish different profiles. "se" and "std" ' - 'color the region between the profile and the ' - 'standard error or standard deviation of the data.', - choices=['lines', 'fill', 'se', 'std'], - default='lines') - optional.add_argument('--sortRegions', - help='Whether the heatmap should present ' - 'the regions sorted. The default is ' - 'to sort in descending order based on ' - 'the mean value per region. Note that "keep" and "no" are the same thing.', - choices=["descend", "ascend", "no", "keep"], - default='descend') - - optional.add_argument('--sortUsing', - help='Indicate which method should be used for ' - 'sorting. For each row the method is computed. ' - 'For region_length, a dashed line is drawn at ' - 'the end of the region (reference point TSS and ' - 'center) or the beginning of the region ' - '(reference point TES) as appropriate.', - choices=["mean", "median", "max", "min", "sum", - "region_length"], - default='mean') - - optional.add_argument('--sortUsingSamples', - help='List of sample numbers (order as in matrix), ' - 'which are used by --sortUsing for sorting. ' - 'If no value is set, it uses all samples. ' - 'Example: --sortUsingSamples 1 3', - type=int, nargs='+') - - optional.add_argument('--linesAtTickMarks', - help='Draw dashed lines from all tick marks through the heatmap. ' - 'This is then similar to the dashed line draw at region bounds ' - 'when using a reference point and --sortUsing region_length', - action='store_true') - - optional.add_argument('--clusterUsingSamples', - help='List of sample numbers (order as in ' - 'matrix), that are used for clustering by ' - '--kmeans or --hclust if not given, all samples ' - 'are taken into account for clustering. 
' - 'Example: --ClusterUsingSamples 1 3', - type=int, nargs='+') + "color the region between the profile and the " + "standard error or standard deviation of the data.", + choices=["lines", "fill", "se", "std"], + default="lines", + ) + optional.add_argument( + "--sortRegions", + help="Whether the heatmap should present " + "the regions sorted. The default is " + "to sort in descending order based on " + 'the mean value per region. Note that "keep" and "no" are the same thing.', + choices=["descend", "ascend", "no", "keep"], + default="descend", + ) + + optional.add_argument( + "--sortUsing", + help="Indicate which method should be used for " + "sorting. For each row the method is computed. " + "For region_length, a dashed line is drawn at " + "the end of the region (reference point TSS and " + "center) or the beginning of the region " + "(reference point TES) as appropriate.", + choices=["mean", "median", "max", "min", "sum", "region_length"], + default="mean", + ) optional.add_argument( - '--averageTypeSummaryPlot', - default='mean', - choices=["mean", "median", "min", - "max", "std", "sum"], - help='Define the type of statistic that should be plotted in the ' + "--sortUsingSamples", + help="List of sample numbers (order as in matrix), " + "which are used by --sortUsing for sorting. " + "If no value is set, it uses all samples. " + "Example: --sortUsingSamples 1 3", + type=int, + nargs="+", + ) + + optional.add_argument( + "--linesAtTickMarks", + help="Draw dashed lines from all tick marks through the heatmap. " + "This is then similar to the dashed line drawn at region bounds " + "when using a reference point and --sortUsing region_length.", + action="store_true", + ) + + optional.add_argument( + "--clusterUsingSamples", + help="List of sample numbers (order as in " + "matrix), that are used for clustering by " + "--kmeans or --hclust. If not given, all samples " + "are taken into account for clustering. " + "Example: --clusterUsingSamples 1 3", + type=int, + nargs="+", + ) + + optional.add_argument( + "--averageTypeSummaryPlot", + default="mean", + choices=["mean", "median", "min", "max", "std", "sum"], + help="Define the type of statistic that should be plotted in the " 'summary image above the heatmap. The options are: "mean", ' - '"median", "min", "max", "sum" and "std".') + '"median", "min", "max", "sum" and "std".', + ) optional.add_argument( - '--missingDataColor', - default='black', - help='If --missingDataAsZero was not set, such cases ' - 'will be colored in black by default. Using this ' - 'parameter, a different color can be set. A value ' - 'between 0 and 1 will be used for a gray scale ' - '(black is 0). For a list of possible color ' - 'names see: http://packages.python.org/ete2/' - 'reference/reference_svgcolors.html. ' - 'Other colors can be specified using the #rrggbb ' - 'notation.') + "--missingDataColor", + default="black", + help="If --missingDataAsZero was not set, such cases " + "will be colored in black by default. Using this " + "parameter, a different color can be set. A value " + "between 0 and 1 will be used for a gray scale " + "(black is 0). For a list of possible color " + "names see: http://packages.python.org/ete2/" + "reference/reference_svgcolors.html. 
" + "Other colors can be specified using the #rrggbb " + "notation.", + ) import matplotlib.pyplot as plt - color_options = "', '".join([x for x in plt.colormaps() if not x.endswith('_r')]) + + color_options = "', '".join( + [x for x in plt.colormaps() if not x.endswith("_r")] + ) optional.add_argument( - '--colorMap', - help='Color map to use for the heatmap. If more than one heatmap is being plotted the color ' - 'of each heatmap can be enter individually (e.g. `--colorMap Reds Blues`). Color maps ' - 'are recycled if the number of color maps is smaller than the number of heatmaps being ' - 'plotted. Available values can be seen here: http://matplotlib.org/users/colormaps.html ' - 'The available options are: \'' + color_options + '\'', - default=['RdYlBu'], - nargs='+') + "--colorMap", + help="Color map to use for the heatmap. If more than one heatmap is being plotted the color " + "of each heatmap can be enter individually (e.g. `--colorMap Reds Blues`). Color maps " + "are recycled if the number of color maps is smaller than the number of heatmaps being " + "plotted. Available values can be seen here: http://matplotlib.org/users/colormaps.html " + "The available options are: '" + color_options + "'", + default=["RdYlBu"], + nargs="+", + ) optional.add_argument( - '--alpha', + "--alpha", default=1.0, type=check_float_0_1, - help='The alpha channel (transparency) to use for the heatmaps. The default is 1.0 and values ' - 'must be between 0 and 1.') + help="The alpha channel (transparency) to use for the heatmaps. The default is 1.0 and values " + "must be between 0 and 1.", + ) optional.add_argument( - '--colorList', - help='List of colors to use to create a colormap. For example, if `--colorList black,yellow,blue` ' - 'is set (colors separated by comas) then a color map that starts with black, continues to ' - 'yellow and finishes in blue is created. If this option is selected, it overrides the --colorMap ' - 'chosen. The list of valid color names can be seen here: ' - 'http://matplotlib.org/examples/color/named_colors.html ' - 'Hex colors are valid (e.g #34a2b1). If individual colors for different heatmaps ' - 'need to be specified they need to be separated by space as for example: ' - '`--colorList "white,#cccccc" "white,darkred"` ' - 'As for --colorMap, the color lists are recycled if their number is smaller thatn the number of' - 'plotted heatmaps. ' - 'The number of transitions is defined by the --colorNumber option.', + "--colorList", + help="List of colors to use to create a colormap. For example, if `--colorList black,yellow,blue` " + "is set (colors separated by comas) then a color map that starts with black, continues to " + "yellow and finishes in blue is created. If this option is selected, it overrides the --colorMap " + "chosen. The list of valid color names can be seen here: " + "http://matplotlib.org/examples/color/named_colors.html " + "Hex colors are valid (e.g #34a2b1). If individual colors for different heatmaps " + "need to be specified they need to be separated by space as for example: " + '`--colorList "white,#cccccc" "white,darkred"` ' + "As for --colorMap, the color lists are recycled if their number is smaller thatn the number of" + "plotted heatmaps. " + "The number of transitions is defined by the --colorNumber option.", type=check_list_of_comma_values, - nargs='+') + nargs="+", + ) optional.add_argument( - '--colorNumber', - help='N.B., --colorList is required for an effect. This controls the ' - 'number of transitions from one color to the other. 
If --colorNumber is ' - 'the number of colors in --colorList then there will be no transitions ' - 'between the colors.', + "--colorNumber", + help="N.B., --colorList is required for an effect. This controls the " + "number of transitions from one color to the other. If --colorNumber is " + "the number of colors in --colorList then there will be no transitions " + "between the colors.", type=int, - default=256) - - optional.add_argument('--zMin', '-min', - default=None, - help='Minimum value for the heatmap intensities. Multiple values, separated by ' - 'spaces can be set for each heatmap. If the number of zMin values is smaller than' - 'the number of heatmaps the values are recycled. If a value is set to "auto", it will be set ' - ' to the first percentile of the matrix values.', - type=str, - nargs='+') - optional.add_argument('--zMax', '-max', - default=None, - help='Maximum value for the heatmap intensities. Multiple values, separated by ' - 'spaces can be set for each heatmap. If the number of zMax values is smaller than' - 'the number of heatmaps the values are recycled. If a value is set to "auto", it will be set ' - ' to the 98th percentile of the matrix values.', - type=str, - nargs='+') - optional.add_argument('--heatmapHeight', - help='Plot height in cm. The default for the heatmap ' - 'height is 28. The minimum value is ' - '3 and the maximum is 100.', - type=float, - default=28) - - optional.add_argument('--heatmapWidth', - help='Plot width in cm. The default value is 4 ' - 'The minimum value is 1 and the ' - 'maximum is 100.', - type=float, - default=4) + default=256, + ) + + optional.add_argument( + "--zMin", + "-min", + default=None, + help="Minimum value for the heatmap intensities. Multiple values, separated by " + "spaces can be set for each heatmap. If the number of zMin values is smaller than " + 'the number of heatmaps the values are recycled. If a value is set to "auto", it will be set ' + " to the first percentile of the matrix values.", + type=str, + nargs="+", + ) + optional.add_argument( + "--zMax", + "-max", + default=None, + help="Maximum value for the heatmap intensities. Multiple values, separated by " + "spaces can be set for each heatmap. If the number of zMax values is smaller than " + 'the number of heatmaps the values are recycled. If a value is set to "auto", it will be set ' + " to the 98th percentile of the matrix values.", + type=str, + nargs="+", + ) optional.add_argument( - '--whatToShow', - help='The default is to include a summary or profile plot on top ' - 'of the heatmap and a heatmap colorbar. Other options are: ' + "--heatmapHeight", + help="Plot height in cm. The default for the heatmap " + "height is 28. The minimum value is " + "3 and the maximum is 100.", + type=float, + default=28, + ) + + optional.add_argument( + "--heatmapWidth", + help="Plot width in cm. The default value is 4. " + "The minimum value is 1 and the " + "maximum is 100.", + type=float, + default=4, + ) + optional.add_argument( + "--whatToShow", + help="The default is to include a summary or profile plot on top " + "of the heatmap and a heatmap colorbar. 
Other options are: " '"plot and heatmap", "heatmap only", "heatmap and ' 'colorbar", and the default "plot, heatmap and ' 'colorbar".', - choices=["plot, heatmap and colorbar", - "plot and heatmap", "heatmap only", - "heatmap and colorbar"], - default='plot, heatmap and colorbar') + choices=[ + "plot, heatmap and colorbar", + "plot and heatmap", + "heatmap only", + "heatmap and colorbar", + ], + default="plot, heatmap and colorbar", + ) optional.add_argument( - '--boxAroundHeatmaps', - help='By default black boxes are plot around heatmaps. This can be turned off ' - 'by setting --boxAroundHeatmaps no', - default='yes') + "--boxAroundHeatmaps", + help="By default black boxes are plot around heatmaps. This can be turned off " + "by setting --boxAroundHeatmaps no", + default="yes", + ) - optional.add_argument('--xAxisLabel', '-x', - default='gene distance (bp)', - help='Description for the x-axis label.') + optional.add_argument( + "--xAxisLabel", + "-x", + default="gene distance (bp)", + help="Description for the x-axis label.", + ) # end elif - optional.add_argument('--startLabel', - default='TSS', - help='[only for scale-regions mode] Label shown ' - 'in the plot for the start of ' - 'the region. Default is TSS (transcription ' - 'start site), but could be changed to anything, ' - 'e.g. "peak start". ' - 'Same for the --endLabel option. See below.') - optional.add_argument('--endLabel', - default='TES', - help='[only for scale-regions mode] Label ' - 'shown in the plot for the region ' - 'end. Default is TES (transcription end site).') - optional.add_argument('--refPointLabel', - help='[only for reference-point mode] Label ' - 'shown in the plot for the ' - 'reference-point. Default ' - 'is the same as the reference point selected ' - '(e.g. TSS), but could be anything, e.g. ' - '"peak start".', - default=None) - - optional.add_argument('--labelRotation', - dest='label_rotation', - help='Rotation of the X-axis labels in degrees. The default is 0, positive values denote a counter-clockwise rotation.', - type=float, - default=0.0) - - optional.add_argument('--nanAfterEnd', - help=argparse.SUPPRESS, - default=False) - - optional.add_argument('--regionsLabel', '-z', - help='Labels for the regions plotted in the ' - 'heatmap. If more than one region is being ' - 'plotted, a list of labels separated by spaces is required. ' - 'If a label itself contains a space, then quotes are ' - 'needed. For example, --regionsLabel label_1, "label 2". ', - nargs='+') - - optional.add_argument('--samplesLabel', - help='Labels for the samples plotted. The ' - 'default is to use the file name of the ' - 'sample. The sample labels should be separated ' - 'by spaces and quoted if a label itself' - 'contains a space E.g. --samplesLabel label-1 "label 2" ', - nargs='+') - - optional.add_argument('--plotTitle', '-T', - help='Title of the plot, to be printed on top of ' - 'the generated image. Leave blank for no title.', - default='') - - optional.add_argument('--yAxisLabel', '-y', - default='', - help='Y-axis label for the top panel.') - - optional.add_argument('--yMin', - default=None, - nargs='+', - help='Minimum value for the Y-axis. Multiple values, separated by ' - 'spaces can be set for each profile. If the number of yMin values is smaller than' - 'the number of plots, the values are recycled.') - optional.add_argument('--yMax', - default=None, - nargs='+', - help='Maximum value for the Y-axis. Multiple values, separated by ' - 'spaces can be set for each profile. 
If the number of yMin values is smaller than' - 'the number of plots, the values are recycled.') - - optional.add_argument('--legendLocation', - default='best', - choices=['best', - 'upper-right', - 'upper-left', - 'upper-center', - 'lower-left', - 'lower-right', - 'lower-center', - 'center', - 'center-left', - 'center-right', - 'none' - ], - help='Location for the legend in the summary plot. ' - 'Note that "none" does not work for the profiler.') - - optional.add_argument('--perGroup', - help='The default is to plot all groups of regions by ' - 'sample. Using this option instead plots all samples by ' - 'group of regions. Note that this is only useful if you ' - 'have multiple groups of regions. by sample rather than ' - 'group.', - action='store_true') - - optional.add_argument('--plotFileFormat', - metavar='', - help='Image format type. If given, this ' - 'option overrides the ' - 'image format based on the plotFile ending. ' - 'The available options are: "png", ' - '"eps", "pdf", "plotly" and "svg"', - choices=['png', 'pdf', 'svg', 'eps', 'plotly']) - - optional.add_argument('--verbose', - help='If set, warning messages and ' - 'additional information are given.', - action='store_true') + optional.add_argument( + "--startLabel", + default="TSS", + help="[only for scale-regions mode] Label shown " + "in the plot for the start of " + "the region. Default is TSS (transcription " + "start site), but could be changed to anything, " + 'e.g. "peak start". ' + "Same for the --endLabel option. See below.", + ) + optional.add_argument( + "--endLabel", + default="TES", + help="[only for scale-regions mode] Label " + "shown in the plot for the region " + "end. Default is TES (transcription end site).", + ) + optional.add_argument( + "--refPointLabel", + help="[only for reference-point mode] Label " + "shown in the plot for the " + "reference-point. Default " + "is the same as the reference point selected " + "(e.g. TSS), but could be anything, e.g. " + '"peak start".', + default=None, + ) + + optional.add_argument( + "--labelRotation", + dest="label_rotation", + help="Rotation of the X-axis labels in degrees. The default is 0, positive values denote a counter-clockwise rotation.", + type=float, + default=0.0, + ) + + optional.add_argument("--nanAfterEnd", help=argparse.SUPPRESS, default=False) + + optional.add_argument( + "--regionsLabel", + "-z", + help="Labels for the regions plotted in the " + "heatmap. If more than one region is being " + "plotted, a list of labels separated by spaces is required. " + "If a label itself contains a space, then quotes are " + 'needed. For example, --regionsLabel label_1 "label 2". ', + nargs="+", + ) + + optional.add_argument( + "--samplesLabel", + help="Labels for the samples plotted. The " + "default is to use the file name of the " + "sample. The sample labels should be separated " + "by spaces and quoted if a label itself " + 'contains a space, e.g. --samplesLabel label-1 "label 2" ', + nargs="+", + ) + + optional.add_argument( + "--plotTitle", + "-T", + help="Title of the plot, to be printed on top of " + "the generated image. Leave blank for no title.", + default="", + ) + + optional.add_argument( + "--yAxisLabel", "-y", default="", help="Y-axis label for the top panel." + ) + + optional.add_argument( + "--yMin", + default=None, + nargs="+", + help="Minimum value for the Y-axis. Multiple values, separated by " + "spaces can be set for each profile. 
If the number of yMin values is smaller than" + "the number of plots, the values are recycled.", + ) + optional.add_argument( + "--yMax", + default=None, + nargs="+", + help="Maximum value for the Y-axis. Multiple values, separated by " + "spaces can be set for each profile. If the number of yMin values is smaller than" + "the number of plots, the values are recycled.", + ) + + optional.add_argument( + "--legendLocation", + default="best", + choices=[ + "best", + "upper-right", + "upper-left", + "upper-center", + "lower-left", + "lower-right", + "lower-center", + "center", + "center-left", + "center-right", + "none", + ], + help="Location for the legend in the summary plot. " + 'Note that "none" does not work for the profiler.', + ) + + optional.add_argument( + "--perGroup", + help="The default is to plot all groups of regions by " + "sample. Using this option instead plots all samples by " + "group of regions. Note that this is only useful if you " + "have multiple groups of regions. by sample rather than " + "group.", + action="store_true", + ) + + optional.add_argument( + "--plotFileFormat", + metavar="", + help="Image format type. If given, this " + "option overrides the " + "image format based on the plotFile ending. " + 'The available options are: "png", ' + '"eps", "pdf", "plotly" and "svg"', + choices=["png", "pdf", "svg", "eps", "plotly"], + ) + + optional.add_argument( + "--verbose", + help="If set, warning messages and " "additional information are given.", + action="store_true", + ) return parser def deepBlueOptionalArgs(): - parser = argparse.ArgumentParser(add_help=False) - dbo = parser.add_argument_group('deepBlue arguments', 'Options used only for remote bedgraph/wig files hosted on deepBlue') + dbo = parser.add_argument_group( + "deepBlue arguments", + "Options used only for remote bedgraph/wig files hosted on deepBlue", + ) dbo.add_argument( - '--deepBlueURL', - help='For remote files bedgraph/wiggle files hosted on deepBlue, this ' - 'specifies the server URL. The default is ' - '"http://deepblue.mpi-inf.mpg.de/xmlrpc", which should not be ' - 'changed without good reason.', - default='http://deepblue.mpi-inf.mpg.de/xmlrpc') + "--deepBlueURL", + help="For remote files bedgraph/wiggle files hosted on deepBlue, this " + "specifies the server URL. The default is " + '"http://deepblue.mpi-inf.mpg.de/xmlrpc", which should not be ' + "changed without good reason.", + default="http://deepblue.mpi-inf.mpg.de/xmlrpc", + ) dbo.add_argument( - '--userKey', - help='For remote files bedgraph/wiggle files hosted on deepBlue, this ' - 'specifies the user key to use for access. The default is ' - '"anonymous_key", which suffices for public datasets. If you need ' - 'access to a restricted access/private dataset, then request a ' - 'key from deepBlue and specify it here.', - default='anonymous_key') + "--userKey", + help="For remote files bedgraph/wiggle files hosted on deepBlue, this " + "specifies the user key to use for access. The default is " + '"anonymous_key", which suffices for public datasets. If you need ' + "access to a restricted access/private dataset, then request a " + "key from deepBlue and specify it here.", + default="anonymous_key", + ) dbo.add_argument( - '--deepBlueTempDir', - help='If specified, temporary files from preloading datasets from ' - 'deepBlue will be written here (note, this directory must exist). 
diff --git a/deeptools/plotCorrelation.py b/deeptools/plotCorrelation.py index a03839baa..84dbe322a 100644 --- a/deeptools/plotCorrelation.py +++ b/deeptools/plotCorrelation.py @@ -5,9 +5,10 @@ import argparse import numpy as np import matplotlib -matplotlib.use('Agg') -matplotlib.rcParams['pdf.fonttype'] = 42 -matplotlib.rcParams['svg.fonttype'] = 'none' + +matplotlib.use("Agg") +matplotlib.rcParams["pdf.fonttype"] = 42 +matplotlib.rcParams["svg.fonttype"] = "none" from deeptools import cm # noqa: F401 import matplotlib.pyplot as plt @@ -15,7 +16,7 @@ from deeptools.parserCommon import writableFile from deeptools._version import __version__ -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") def parse_arguments(args=None): @@ -38,93 +39,120 @@ def parse_arguments(args=None): plotCorrelation -h """, - epilog='example usages:\n' - 'plotCorrelation -in results_file --whatToPlot heatmap --corMethod pearson -o heatmap.png\n\n' - ' \n\n', - parents=[basic_args, heatmap_parser, scatter_parser]) + epilog="example usages:\n" + "plotCorrelation -in results_file --whatToPlot heatmap --corMethod pearson -o heatmap.png\n\n" + " \n\n", + parents=[basic_args, heatmap_parser, scatter_parser], + ) return parser def plot_correlation_args(): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') + required = parser.add_argument_group("Required arguments") # define the arguments - required.add_argument('--corData', '-in', - metavar='FILE', - help='Compressed matrix of values generated by multiBigwigSummary or multiBamSummary', - required=True) - - required.add_argument('--corMethod', '-c', - help="Correlation method.", - choices=['spearman', 'pearson'], - required=True) - 
required.add_argument('--whatToPlot', '-p', - help="Choose between a heatmap or pairwise scatter plots", - choices=['heatmap', 'scatterplot'], - required=True) - - optional = parser.add_argument_group('Optional arguments') - optional.add_argument('--plotFile', '-o', - help='File to save the heatmap to. The file extension determines the format, ' - 'so heatmap.pdf will save the heatmap in PDF format. ' - 'The available formats are: .png, ' - '.eps, .pdf and .svg.', - type=writableFile, - metavar='FILE') - - optional.add_argument('--skipZeros', - help='By setting this option, genomic regions ' - 'that have zero or missing (nan) values in all samples ' - 'are excluded.', - action='store_true', - required=False) - - optional.add_argument('--labels', '-l', - metavar='sample1 sample2', - help='User defined labels instead of default labels from ' - 'file names. ' - 'Multiple labels have to be separated by spaces, e.g. ' - '--labels sample1 sample2 sample3', - nargs='+') - - optional.add_argument('--plotTitle', '-T', - help='Title of the plot, to be printed on top of ' - 'the generated image. Leave blank for no title. (Default: %(default)s)', - default='') - - optional.add_argument('--plotFileFormat', - metavar='FILETYPE', - help='Image format type. If given, this option ' - 'overrides the image format based on the plotFile ' - 'ending. The available options are: png, ' - 'eps, pdf and svg.', - choices=['png', 'pdf', 'svg', 'eps', 'plotly']) + required.add_argument( + "--corData", + "-in", + metavar="FILE", + help="Compressed matrix of values generated by multiBigwigSummary or multiBamSummary", + required=True, + ) + + required.add_argument( + "--corMethod", + "-c", + help="Correlation method.", + choices=["spearman", "pearson"], + required=True, + ) + + required.add_argument( + "--whatToPlot", + "-p", + help="Choose between a heatmap or pairwise scatter plots", + choices=["heatmap", "scatterplot"], + required=True, + ) + + optional = parser.add_argument_group("Optional arguments") + optional.add_argument( + "--plotFile", + "-o", + help="File to save the heatmap to. The file extension determines the format, " + "so heatmap.pdf will save the heatmap in PDF format. " + "The available formats are: .png, " + ".eps, .pdf and .svg.", + type=writableFile, + metavar="FILE", + ) + + optional.add_argument( + "--skipZeros", + help="By setting this option, genomic regions " + "that have zero or missing (nan) values in all samples " + "are excluded.", + action="store_true", + required=False, + ) + + optional.add_argument( + "--labels", + "-l", + metavar="sample1 sample2", + help="User defined labels instead of default labels from " + "file names. " + "Multiple labels have to be separated by spaces, e.g. " + "--labels sample1 sample2 sample3", + nargs="+", + ) + + optional.add_argument( + "--plotTitle", + "-T", + help="Title of the plot, to be printed on top of " + "the generated image. Leave blank for no title. (Default: %(default)s)", + default="", + ) optional.add_argument( - '--removeOutliers', - help='If set, bins with very large counts are removed. ' - 'Bins with abnormally high reads counts artificially increase ' - 'pearson correlation; that\'s why, multiBamSummary tries ' - 'to remove outliers using the median absolute deviation (MAD) ' - 'method applying a threshold of 200 to only consider extremely ' - 'large deviations from the median. 
The ENCODE blacklist page ' - '(https://sites.google.com/site/anshulkundaje/projects/blacklists) ' - 'contains useful information about regions with unusually high counts' - 'that may be worth removing.', - action='store_true') - - optional.add_argument('--version', action='version', - version='%(prog)s {}'.format(__version__)) - - group = parser.add_argument_group('Output optional options') - - group.add_argument('--outFileCorMatrix', - help='Save matrix with pairwise correlation values to a tab-separated file.', - metavar='FILE', - type=writableFile) + "--plotFileFormat", + metavar="FILETYPE", + help="Image format type. If given, this option " + "overrides the image format based on the plotFile " + "ending. The available options are: png, " + "eps, pdf and svg.", + choices=["png", "pdf", "svg", "eps", "plotly"], + ) + + optional.add_argument( + "--removeOutliers", + help="If set, bins with very large counts are removed. " + "Bins with abnormally high read counts artificially increase " + "pearson correlation; that's why multiBamSummary tries " + "to remove outliers using the median absolute deviation (MAD) " + "method applying a threshold of 200 to only consider extremely " + "large deviations from the median. The ENCODE blacklist page " + "(https://sites.google.com/site/anshulkundaje/projects/blacklists) " + "contains useful information about regions with unusually high counts " + "that may be worth removing.", + action="store_true", + ) + + optional.add_argument( + "--version", action="version", version="%(prog)s {}".format(__version__) + ) + + group = parser.add_argument_group("Output optional options") + + group.add_argument( + "--outFileCorMatrix", + help="Save matrix with pairwise correlation values to a tab-separated file.", + metavar="FILE", + type=writableFile, + ) return parser @@ -134,23 +162,29 @@ def scatterplot_options(): """ Options specific for creating the scatter plot """ parser = argparse.ArgumentParser(add_help=False) - scatter_opts = parser.add_argument_group('Scatter plot options') - - scatter_opts.add_argument('--xRange', - help='The X axis range. The default scales these such that the full range of dots is displayed.', - type=int, - nargs=2, - default=None) - - scatter_opts.add_argument('--yRange', - help='The Y axis range. The default scales these such that the full range of dots is displayed.', - type=int, - nargs=2, - default=None) - - scatter_opts.add_argument('--log1p', - help='Plot the natural log of the scatter plot after adding 1. Note that this is ONLY for plotting, the correlation is unaffected.', - action='store_true') + scatter_opts = parser.add_argument_group("Scatter plot options") + + scatter_opts.add_argument( + "--xRange", + help="The X axis range. The default scales these such that the full range of dots is displayed.", + type=int, + nargs=2, + default=None, + ) + + scatter_opts.add_argument( + "--yRange", + help="The Y axis range. The default scales these such that the full range of dots is displayed.", + type=int, + nargs=2, + default=None, + ) + + scatter_opts.add_argument( + "--log1p", + help="Plot the natural log of the scatter plot after adding 1. Note that this is ONLY for plotting, the correlation is unaffected.", + action="store_true", + ) return parser @@ -160,98 +194,122 @@ def heatmap_options(): """ Options for generating the correlation heatmap """ parser = argparse.ArgumentParser(add_help=False) - heatmap = parser.add_argument_group('Heatmap options') - - heatmap.add_argument('--plotHeight', - help='Plot height in cm. 
(Default: %(default)s)', - type=float, - default=9.5) - - heatmap.add_argument('--plotWidth', - help='Plot width in cm. The minimum value is 1 cm. (Default: %(default)s)', - type=float, - default=11) - - heatmap.add_argument('--zMin', '-min', - default=None, - help='Minimum value for the heatmap intensities. ' - 'If not specified, the value is set automatically', - type=float) - - heatmap.add_argument('--zMax', '-max', - default=None, - help='Maximum value for the heatmap intensities.' - 'If not specified, the value is set automatically', - type=float) + heatmap = parser.add_argument_group("Heatmap options") + + heatmap.add_argument( + "--plotHeight", + help="Plot height in cm. (Default: %(default)s)", + type=float, + default=9.5, + ) + + heatmap.add_argument( + "--plotWidth", + help="Plot width in cm. The minimum value is 1 cm. (Default: %(default)s)", + type=float, + default=11, + ) + + heatmap.add_argument( + "--zMin", + "-min", + default=None, + help="Minimum value for the heatmap intensities. " + "If not specified, the value is set automatically.", + type=float, + ) heatmap.add_argument( - '--colorMap', default='jet', - metavar='', - help='Color map to use for the heatmap. Available values can be ' - 'seen here: ' - 'http://matplotlib.org/examples/color/colormaps_reference.html') - - heatmap.add_argument('--plotNumbers', - help='If set, then the correlation number is plotted ' - 'on top of the heatmap. This option is only valid when plotting a heatmap.', - action='store_true', - required=False) + "--zMax", + "-max", + default=None, + help="Maximum value for the heatmap intensities. " + "If not specified, the value is set automatically.", + type=float, + ) + + heatmap.add_argument( + "--colorMap", + default="jet", + metavar="", + help="Color map to use for the heatmap. Available values can be " + "seen here: " + "http://matplotlib.org/examples/color/colormaps_reference.html", + ) + + heatmap.add_argument( + "--plotNumbers", + help="If set, then the correlation number is plotted " + "on top of the heatmap. This option is only valid when plotting a heatmap.", + action="store_true", + required=False, + ) return parser def main(args=None): - args = parse_arguments().parse_args(args) if args.plotFile is None and args.outFileCorMatrix is None: - sys.exit("At least one of --plotFile and --outFileCorMatrix must be specified!\n") - - corr = Correlation(args.corData, - args.corMethod, - labels=args.labels, - remove_outliers=args.removeOutliers, - skip_zeros=args.skipZeros) - - if args.corMethod == 'pearson': + sys.exit( + "At least one of --plotFile and --outFileCorMatrix must be specified!\n" + ) + + corr = Correlation( + args.corData, + args.corMethod, + labels=args.labels, + remove_outliers=args.removeOutliers, + skip_zeros=args.skipZeros, + ) + + if args.corMethod == "pearson": # test if there are outliers and write a message recommending the removal if len(corr.get_outlier_indices(np.asarray(corr.matrix).flatten())) > 0: if args.removeOutliers: - sys.stderr.write("\nOutliers were detected in the data. They " - "will be removed to avoid bias " - "in the pearson correlation.\n") + sys.stderr.write( + "\nOutliers were detected in the data. They " + "will be removed to avoid bias " + "in the pearson correlation.\n" + ) else: - sys.stderr.write("\nOutliers were detected in the data. 
Consider " + "using the --removeOutliers parameter to avoid a bias " + "in the pearson correlation.\n" + ) if args.colorMap: try: plt.get_cmap(args.colorMap) except ValueError as error: - sys.stderr.write( - "A problem was found. Message: {}\n".format(error)) + sys.stderr.write("A problem was found. Message: {}\n".format(error)) exit() if args.plotFile is not None: - if args.whatToPlot == 'scatterplot': - corr.plot_scatter(args.plotFile, - plot_title=args.plotTitle, - image_format=args.plotFileFormat, - xRange=args.xRange, - yRange=args.yRange, - log1p=args.log1p) + if args.whatToPlot == "scatterplot": + corr.plot_scatter( + args.plotFile, + plot_title=args.plotTitle, + image_format=args.plotFileFormat, + xRange=args.xRange, + yRange=args.yRange, + log1p=args.log1p, + ) else: - corr.plot_correlation(args.plotFile, - vmax=args.zMax, - vmin=args.zMin, - colormap=args.colorMap, - plot_title=args.plotTitle, - image_format=args.plotFileFormat, - plot_numbers=args.plotNumbers, - plotWidth=args.plotWidth, - plotHeight=args.plotHeight) + corr.plot_correlation( + args.plotFile, + vmax=args.zMax, + vmin=args.zMin, + colormap=args.colorMap, + plot_title=args.plotTitle, + image_format=args.plotFileFormat, + plot_numbers=args.plotNumbers, + plotWidth=args.plotWidth, + plotHeight=args.plotHeight, + ) if args.outFileCorMatrix: o = open(args.outFileCorMatrix, "w") diff --git a/deeptools/plotCoverage.py b/deeptools/plotCoverage.py index 02ce25dad..c3de63177 100755 --- a/deeptools/plotCoverage.py +++ b/deeptools/plotCoverage.py @@ -7,9 +7,10 @@ import numpy as np import matplotlib -matplotlib.use('Agg') -matplotlib.rcParams['pdf.fonttype'] = 42 -matplotlib.rcParams['svg.fonttype'] = 'none' + +matplotlib.use("Agg") +matplotlib.rcParams["pdf.fonttype"] = 42 +matplotlib.rcParams["svg.fonttype"] = "none" from deeptools import cm # noqa: F401 import matplotlib.pyplot as plt import plotly.offline as py @@ -20,19 +21,18 @@ from deeptools.utilities import smartLabels from deeptools._version import __version__ -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") def parse_arguments(args=None): parent_parser = parserCommon.getParentArgParse(binSize=False) read_options_parser = parserCommon.read_options() - parser = \ - argparse.ArgumentParser( - parents=[required_args(), parent_parser, read_options_parser], - formatter_class=argparse.RawDescriptionHelpFormatter, - add_help=False, - description=""" + parser = argparse.ArgumentParser( + parents=[required_args(), parent_parser, read_options_parser], + formatter_class=argparse.RawDescriptionHelpFormatter, + add_help=False, + description=""" This tool is useful to assess the sequencing depth of a given sample. 
diff --git a/deeptools/plotCoverage.py b/deeptools/plotCoverage.py index 02ce25dad..c3de63177 100755 --- a/deeptools/plotCoverage.py +++ b/deeptools/plotCoverage.py @@ -7,9 +7,10 @@ import numpy as np import matplotlib -matplotlib.use('Agg') -matplotlib.rcParams['pdf.fonttype'] = 42 -matplotlib.rcParams['svg.fonttype'] = 'none' + +matplotlib.use("Agg") +matplotlib.rcParams["pdf.fonttype"] = 42 +matplotlib.rcParams["svg.fonttype"] = "none" from deeptools import cm # noqa: F401 import matplotlib.pyplot as plt import plotly.offline as py @@ -20,19 +21,18 @@ from deeptools.utilities import smartLabels from deeptools._version import __version__ -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") def parse_arguments(args=None): parent_parser = parserCommon.getParentArgParse(binSize=False) read_options_parser = parserCommon.read_options() - parser = \ - argparse.ArgumentParser( - parents=[required_args(), parent_parser, read_options_parser], - formatter_class=argparse.RawDescriptionHelpFormatter, - add_help=False, - description=""" + parser = argparse.ArgumentParser( + parents=[required_args(), parent_parser, read_options_parser], + formatter_class=argparse.RawDescriptionHelpFormatter, + add_help=False, + description=""" This tool is useful to assess the sequencing depth of a given sample. It samples 1 million bp, counts the number of overlapping reads and can report @@ -43,13 +43,15 @@ def parse_arguments(args=None): $ plotCoverage -h """, - epilog='example usages:\nplotCoverage ' - '--bamfiles file1.bam file2.bam -o results.png\n\n' - ' \n\n', - conflict_handler='resolve') + epilog="example usages:\nplotCoverage " + "--bamfiles file1.bam file2.bam -o results.png\n\n" + " \n\n", + conflict_handler="resolve", + ) - parser.add_argument('--version', action='version', - version='plotCoverage {}'.format(__version__)) + parser.add_argument( + "--version", action="version", version="plotCoverage {}".format(__version__) + ) return parser @@ -70,101 +72,136 @@ def process_args(args=None): def required_args(): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') - - required.add_argument('--bamfiles', '-b', - metavar='FILE1 FILE2', - help='List of indexed BAM files separated by spaces.', - nargs='+', - required=True) - - optional = parser.add_argument_group('Optional arguments') - - optional.add_argument("--help", "-h", action="help", - help="show this help message and exit") - - optional.add_argument('--plotFile', '-o', - type=parserCommon.writableFile, - help='File name to save the plot to.') - - optional.add_argument('--labels', '-l', - metavar='sample1 sample2', - help='User defined labels instead of default labels from ' - 'file names. ' - 'Multiple labels have to be separated by spaces, e.g. ' - '--labels sample1 sample2 sample3', - nargs='+') - - optional.add_argument('--smartLabels', - action='store_true', - help='Instead of manually specifying labels for the input ' - 'BAM files, this causes deepTools to use the file name ' - 'after removing the path and extension.') - - optional.add_argument('--plotTitle', '-T', - help='Title of the plot, to be printed on top of ' - 'the generated image. Leave blank for no title. (Default: %(default)s)', - default='') - - optional.add_argument('--skipZeros', - help='By setting this option, genomic regions ' - 'that have zero or nan values in _all_ samples ' - 'are excluded.', - action='store_true', - required=False) - - optional.add_argument('--numberOfSamples', '-n', - help='Number of 1 bp regions to sample. (Default: %(default)s)', - required=False, - type=int, - default=1000000) - - optional.add_argument('--BED', - help='Limits the coverage analysis to ' - 'the regions specified in these files. This overrides --numberOfSamples. ' - 'Due to memory requirements, it is inadvised to combine this with ' - '--outRawCounts or many tens of thousands of regions, as per-base ' - 'coverage is used!', - metavar='FILE1.bed FILE2.bed', - nargs='+') - - optional.add_argument('--outRawCounts', - help='Save raw counts (coverages) to file.', - type=parserCommon.writableFile, - metavar='FILE') - - optional.add_argument('--outCoverageMetrics', - help='Save percentage of bins/regions above the specified thresholds to ' - 'the specified file. The coverage thresholds are specified by ' - '--coverageThresholds. If no coverage thresholds are specified, the file ' - 'will be empty.', - type=parserCommon.writableFile, - metavar='FILE') - - optional.add_argument('--coverageThresholds', '-ct', - type=int, - action="append", - help='The percentage of reported bins/regions with signal at least as ' - 'high as the given threshold. This can be specified multiple times.') - - optional.add_argument('--plotHeight', - help='Plot height in cm. 
(Default: %(default)s)', - type=float, - default=5.0) - - optional.add_argument('--plotWidth', - help='Plot width in cm. The minimum value is 1 cm. (Default: %(default)s)', - type=float, - default=15.0) - - optional.add_argument('--plotFileFormat', - metavar='FILETYPE', - help='Image format type. If given, this option ' - 'overrides the image format based on the plotFile ' - 'ending. The available options are: png, ' - 'eps, pdf, svg and plotly.', - default=None, - choices=['png', 'pdf', 'svg', 'eps', 'plotly']) + required = parser.add_argument_group("Required arguments") + + required.add_argument( + "--bamfiles", + "-b", + metavar="FILE1 FILE2", + help="List of indexed BAM files separated by spaces.", + nargs="+", + required=True, + ) + + optional = parser.add_argument_group("Optional arguments") + + optional.add_argument( + "--help", "-h", action="help", help="show this help message and exit" + ) + + optional.add_argument( + "--plotFile", + "-o", + type=parserCommon.writableFile, + help="File name to save the plot to.", + ) + + optional.add_argument( + "--labels", + "-l", + metavar="sample1 sample2", + help="User defined labels instead of default labels from " + "file names. " + "Multiple labels have to be separated by spaces, e.g. " + "--labels sample1 sample2 sample3", + nargs="+", + ) + + optional.add_argument( + "--smartLabels", + action="store_true", + help="Instead of manually specifying labels for the input " + "BAM files, this causes deepTools to use the file name " + "after removing the path and extension.", + ) + + optional.add_argument( + "--plotTitle", + "-T", + help="Title of the plot, to be printed on top of " + "the generated image. Leave blank for no title. (Default: %(default)s)", + default="", + ) + + optional.add_argument( + "--skipZeros", + help="By setting this option, genomic regions " + "that have zero or nan values in _all_ samples " + "are excluded.", + action="store_true", + required=False, + ) + + optional.add_argument( + "--numberOfSamples", + "-n", + help="Number of 1 bp regions to sample. (Default: %(default)s)", + required=False, + type=int, + default=1000000, + ) + + optional.add_argument( + "--BED", + help="Limits the coverage analysis to " + "the regions specified in these files. This overrides --numberOfSamples. " + "Due to memory requirements, it is inadvisable to combine this with " + "--outRawCounts or many tens of thousands of regions, as per-base " + "coverage is used!", + metavar="FILE1.bed FILE2.bed", + nargs="+", + ) + + optional.add_argument( + "--outRawCounts", + help="Save raw counts (coverages) to file.", + type=parserCommon.writableFile, + metavar="FILE", + ) + + optional.add_argument( + "--outCoverageMetrics", + help="Save percentage of bins/regions above the specified thresholds to " + "the specified file. The coverage thresholds are specified by " + "--coverageThresholds. If no coverage thresholds are specified, the file " + "will be empty.", + type=parserCommon.writableFile, + metavar="FILE", + ) + + optional.add_argument( + "--coverageThresholds", + "-ct", + type=int, + action="append", + help="The percentage of reported bins/regions with signal at least as " + "high as the given threshold. This can be specified multiple times.", + ) + + optional.add_argument( + "--plotHeight", + help="Plot height in cm. (Default: %(default)s)", + type=float, + default=5.0, + ) + + optional.add_argument( + "--plotWidth", + help="Plot width in cm. The minimum value is 1 cm. 
(Default: %(default)s)", + type=float, + default=15.0, + ) + + optional.add_argument( + "--plotFileFormat", + metavar="FILETYPE", + help="Image format type. If given, this option " + "overrides the image format based on the plotFile " + "ending. The available options are: png, " + "eps, pdf, svg and plotly.", + default=None, + choices=["png", "pdf", "svg", "eps", "plotly"], + ) return parser @@ -173,31 +210,35 @@ def main(args=None): args = process_args(args) if not args.outRawCounts and not args.plotFile and not args.outCoverageMetrics: - sys.exit("At least one of --plotFile, --outRawCounts and --outCoverageMetrics are required.\n") + sys.exit( + "At least one of --plotFile, --outRawCounts and --outCoverageMetrics are required.\n" + ) - if 'BED' in args: + if "BED" in args: bed_regions = args.BED else: bed_regions = None - cr = countR.CountReadsPerBin(args.bamfiles, - binLength=1, - bedFile=bed_regions, - numberOfSamples=args.numberOfSamples, - numberOfProcessors=args.numberOfProcessors, - verbose=args.verbose, - region=args.region, - blackListFileName=args.blackListFileName, - extendReads=args.extendReads, - minMappingQuality=args.minMappingQuality, - ignoreDuplicates=args.ignoreDuplicates, - center_read=args.centerReads, - samFlag_include=args.samFlagInclude, - samFlag_exclude=args.samFlagExclude, - minFragmentLength=args.minFragmentLength, - maxFragmentLength=args.maxFragmentLength, - bed_and_bin=True, - out_file_for_raw_data=args.outRawCounts) + cr = countR.CountReadsPerBin( + args.bamfiles, + binLength=1, + bedFile=bed_regions, + numberOfSamples=args.numberOfSamples, + numberOfProcessors=args.numberOfProcessors, + verbose=args.verbose, + region=args.region, + blackListFileName=args.blackListFileName, + extendReads=args.extendReads, + minMappingQuality=args.minMappingQuality, + ignoreDuplicates=args.ignoreDuplicates, + center_read=args.centerReads, + samFlag_include=args.samFlagInclude, + samFlag_exclude=args.samFlagExclude, + minFragmentLength=args.minFragmentLength, + maxFragmentLength=args.maxFragmentLength, + bed_and_bin=True, + out_file_for_raw_data=args.outRawCounts, + ) num_reads_per_bin = cr.run() @@ -209,7 +250,7 @@ def main(args=None): for thresh in args.coverageThresholds: vals = np.sum(num_reads_per_bin >= thresh, axis=0) for lab, val in zip(args.labels, vals): - of.write("{}\t{}\t{:6.3f}\n".format(lab, thresh, 100. 
* val / nbins)) + of.write("{}\t{}\t{:6.3f}\n".format(lab, thresh, 100.0 * val / nbins)) of.close() if args.outRawCounts: @@ -217,28 +258,46 @@ def main(args=None): # labels header = "#plotCoverage --outRawCounts\n#'chr'\t'start'\t'end'\t" header += "'" + "'\t'".join(args.labels) + "'\n" - f = open(args.outRawCounts, 'r+') + f = open(args.outRawCounts, "r+") content = f.read() f.seek(0, 0) f.write(header + content) f.close() if num_reads_per_bin.shape[0] < 2: - exit("ERROR: too few non-zero bins found.\n" - "If using --region please check that this " - "region is covered by reads.\n") + exit( + "ERROR: too few non-zero bins found.\n" + "If using --region please check that this " + "region is covered by reads.\n" + ) if args.skipZeros: num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin) if args.plotFile: - if args.plotFileFormat == 'plotly': + if args.plotFileFormat == "plotly": fig = go.Figure() - fig['layout']['xaxis1'] = {'domain': [0.0, 0.48], 'anchor': 'x1', 'title': 'coverage (#reads per base)'} - fig['layout']['xaxis2'] = {'domain': [0.52, 1.0], 'anchor': 'x2', 'title': 'coverage (#reads per base)'} - fig['layout']['yaxis1'] = {'domain': [0.0, 1.0], 'anchor': 'x1', 'title': 'fraction of bases sampled'} - fig['layout']['yaxis2'] = {'domain': [0.0, 1.0], 'anchor': 'x2', 'title': 'fraction of bases sampled >= coverage'} - fig['layout'].update(title=args.plotTitle) + fig["layout"]["xaxis1"] = { + "domain": [0.0, 0.48], + "anchor": "x1", + "title": "coverage (#reads per base)", + } + fig["layout"]["xaxis2"] = { + "domain": [0.52, 1.0], + "anchor": "x2", + "title": "coverage (#reads per base)", + } + fig["layout"]["yaxis1"] = { + "domain": [0.0, 1.0], + "anchor": "x1", + "title": "fraction of bases sampled", + } + fig["layout"]["yaxis2"] = { + "domain": [0.0, 1.0], + "anchor": "x2", + "title": "fraction of bases sampled >= coverage", + } + fig["layout"].update(title=args.plotTitle) else: fig, axs = plt.subplots(1, 2, figsize=(args.plotWidth, args.plotHeight)) plt.suptitle(args.plotTitle) @@ -271,69 +330,91 @@ def main(args=None): data = [] # We need to manually set the line colors so they're shared between the two plots. 
plotly_colors = ["#d73027", "#fc8d59", "#f33090", "#e0f3f8", "#91bfdb", "#4575b4"] - plotly_styles = sum([6 * ["solid"], 6 * ["dot"], 6 * ["dash"], 6 * ["longdash"], 6 * ["dashdot"], 6 * ["longdashdot"]], []) + plotly_styles = sum( + [ + 6 * ["solid"], + 6 * ["dot"], + 6 * ["dash"], + 6 * ["longdash"], + 6 * ["dashdot"], + 6 * ["longdashdot"], + ], + [], + ) for idx, col in enumerate(num_reads_per_bin.T): if args.plotFile: - frac_reads_per_coverage = np.bincount(col.astype(int)).astype(float) / num_reads_per_bin.shape[0] + frac_reads_per_coverage = ( + np.bincount(col.astype(int)).astype(float) / num_reads_per_bin.shape[0] + ) csum = np.bincount(col.astype(int))[::-1].cumsum() csum_frac = csum.astype(float)[::-1] / csum.max() - if args.plotFileFormat == 'plotly': + if args.plotFileFormat == "plotly": color = plotly_colors[idx % len(plotly_colors)] dash = plotly_styles[idx % len(plotly_styles)] - trace = go.Scatter(x=np.arange(0, int(x_max) - 1), - y=frac_reads_per_coverage[:int(x_max)], - mode='lines', - xaxis='x1', - yaxis='y1', - line=dict(color=color, dash=dash), - name="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]), - legendgroup="{}".format(idx)) + trace = go.Scatter( + x=np.arange(0, int(x_max) - 1), + y=frac_reads_per_coverage[: int(x_max)], + mode="lines", + xaxis="x1", + yaxis="y1", + line=dict(color=color, dash=dash), + name="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]), + legendgroup="{}".format(idx), + ) data.append(trace) - trace = go.Scatter(x=np.arange(0, int(x_max) - 1), - y=csum_frac[:int(x_max)], - mode='lines', - xaxis='x2', - yaxis='y2', - line=dict(color=color, dash=dash), - name=args.labels[idx], - showlegend=False, - legendgroup="{}".format(idx)) + trace = go.Scatter( + x=np.arange(0, int(x_max) - 1), + y=csum_frac[: int(x_max)], + mode="lines", + xaxis="x2", + yaxis="y2", + line=dict(color=color, dash=dash), + name=args.labels[idx], + showlegend=False, + legendgroup="{}".format(idx), + ) data.append(trace) else: - axs[0].plot(frac_reads_per_coverage, label="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx])) + axs[0].plot( + frac_reads_per_coverage, + label="{}, mean={:.1f}".format(args.labels[idx], sample_mean[idx]), + ) axs[1].plot(csum_frac, label=args.labels[idx]) # find the indexes (i.e. 
the x values) for which the cumulative distribution 'fraction of bases # sampled >= coverage' where fraction of bases sampled = 50%: `np.flatnonzero(csum_frac>0.5)` # then find the fraction of bases sampled that that have the largest x y_max.append(frac_reads_per_coverage[max(np.flatnonzero(csum_frac > 0.5))]) - print("{}\t{:0.2f}\t{:0.2f}\t{}\t{}\t{}\t{}\t{}\t".format(args.labels[idx], - sample_mean[idx], - sample_std[idx], - sample_min[idx], - sample_25[idx], - sample_50[idx], - sample_75[idx], - sample_max[idx], - )) + print( + "{}\t{:0.2f}\t{:0.2f}\t{}\t{}\t{}\t{}\t{}\t".format( + args.labels[idx], + sample_mean[idx], + sample_std[idx], + sample_min[idx], + sample_25[idx], + sample_50[idx], + sample_75[idx], + sample_max[idx], + ) + ) if args.plotFile: # Don't clip plots y_max = max(y_max) if args.plotFileFormat == "plotly": fig.add_traces(data) - fig['layout']['yaxis1'].update(range=[0.0, min(1, y_max + (y_max * 0.10))]) - fig['layout']['yaxis2'].update(range=[0.0, 1.0]) + fig["layout"]["yaxis1"].update(range=[0.0, min(1, y_max + (y_max * 0.10))]) + fig["layout"]["yaxis2"].update(range=[0.0, 1.0]) py.plot(fig, filename=args.plotFile, auto_open=False) else: axs[0].set_ylim(0, min(1, y_max + (y_max * 0.10))) axs[0].set_xlim(0, x_max) - axs[0].set_xlabel('coverage (#reads per bp)') + axs[0].set_xlabel("coverage (#reads per bp)") axs[0].legend(fancybox=True, framealpha=0.5) - axs[0].set_ylabel('fraction of bases sampled') + axs[0].set_ylabel("fraction of bases sampled") # plot cumulative coverage axs[1].set_xlim(0, x_max) - axs[1].set_xlabel('coverage (#reads per bp)') - axs[1].set_ylabel('fraction of bases sampled >= coverage') + axs[1].set_xlabel("coverage (#reads per bp)") + axs[1].set_ylabel("fraction of bases sampled >= coverage") axs[1].legend(fancybox=True, framealpha=0.5) plt.savefig(args.plotFile, format=args.plotFileFormat) plt.close() diff --git a/deeptools/plotEnrichment.py b/deeptools/plotEnrichment.py index 7ef474eff..89469bc1d 100755 --- a/deeptools/plotEnrichment.py +++ b/deeptools/plotEnrichment.py @@ -5,9 +5,10 @@ import argparse import numpy as np import matplotlib -matplotlib.use('Agg') -matplotlib.rcParams['pdf.fonttype'] = 42 -matplotlib.rcParams['svg.fonttype'] = 'none' + +matplotlib.use("Agg") +matplotlib.rcParams["pdf.fonttype"] = 42 +matplotlib.rcParams["svg.fonttype"] = "none" from deeptools import cm # noqa: F401 import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec @@ -24,7 +25,7 @@ from deeptools import parserCommon -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") def parse_arguments(args=None): @@ -48,152 +49,195 @@ def parse_arguments(args=None): plotEnrichment -h """, - epilog='example usages:\n' - 'plotEnrichment -b file1.bam file2.bam --BED peaks.bed -o enrichment.png\n\n' - ' \n\n', - parents=[basic_args, parent_parser, read_options]) + epilog="example usages:\n" + "plotEnrichment -b file1.bam file2.bam --BED peaks.bed -o enrichment.png\n\n" + " \n\n", + parents=[basic_args, parent_parser, read_options], + ) return parser def plot_enrichment_args(): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') + required = parser.add_argument_group("Required arguments") # define the arguments - required.add_argument('--bamfiles', '-b', - metavar='file1.bam file2.bam', - help='List of indexed bam files separated by spaces.', - nargs='+', - required=True) - - required.add_argument('--BED', - help='Limits the enrichment analysis to ' - 'the regions specified in 
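Editor's note: the plotCoverage changes above are purely stylistic (Black-style quoting and wrapping). As a quick smoke test that behavior is unchanged, the entry point can be driven directly from Python; a minimal sketch, assuming deepTools is installed and the hypothetical BAM files exist and are indexed:

# Editor's sketch: invoke the reformatted plotCoverage programmatically.
# sample1.bam and sample2.bam are hypothetical indexed BAM files.
from deeptools import plotCoverage

plotCoverage.main([
    "--bamfiles", "sample1.bam", "sample2.bam",
    "--plotFile", "coverage.png",
    "--numberOfSamples", "10000",
])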
diff --git a/deeptools/plotEnrichment.py b/deeptools/plotEnrichment.py
index 7ef474eff..89469bc1d 100755
--- a/deeptools/plotEnrichment.py
+++ b/deeptools/plotEnrichment.py
@@ -5,9 +5,10 @@
 import argparse
 import numpy as np
 import matplotlib
-matplotlib.use('Agg')
-matplotlib.rcParams['pdf.fonttype'] = 42
-matplotlib.rcParams['svg.fonttype'] = 'none'
+
+matplotlib.use("Agg")
+matplotlib.rcParams["pdf.fonttype"] = 42
+matplotlib.rcParams["svg.fonttype"] = "none"
 from deeptools import cm  # noqa: F401
 import matplotlib.pyplot as plt
 import matplotlib.gridspec as gridspec
@@ -24,7 +25,7 @@
 from deeptools import parserCommon


-old_settings = np.seterr(all='ignore')
+old_settings = np.seterr(all="ignore")


 def parse_arguments(args=None):
@@ -48,152 +49,195 @@ def parse_arguments(args=None):
 plotEnrichment -h
 """,
-        epilog='example usages:\n'
-        'plotEnrichment -b file1.bam file2.bam --BED peaks.bed -o enrichment.png\n\n'
-        ' \n\n',
-        parents=[basic_args, parent_parser, read_options])
+        epilog="example usages:\n"
+        "plotEnrichment -b file1.bam file2.bam --BED peaks.bed -o enrichment.png\n\n"
+        " \n\n",
+        parents=[basic_args, parent_parser, read_options],
+    )

     return parser


 def plot_enrichment_args():
     parser = argparse.ArgumentParser(add_help=False)
-    required = parser.add_argument_group('Required arguments')
+    required = parser.add_argument_group("Required arguments")

     # define the arguments
-    required.add_argument('--bamfiles', '-b',
-                          metavar='file1.bam file2.bam',
-                          help='List of indexed bam files separated by spaces.',
-                          nargs='+',
-                          required=True)
-
-    required.add_argument('--BED',
-                          help='Limits the enrichment analysis to '
-                          'the regions specified in these BED/GTF files. Enrichment '
-                          'is calculated as the number of reads overlapping each '
-                          'feature type. The feature type is column 3 in a GTF file '
-                          'and "peak" for BED files.',
-                          metavar='FILE1.bed FILE2.bed',
-                          nargs='+',
-                          required=True)
-
-    optional = parser.add_argument_group('Optional arguments')
-
-    optional.add_argument('--plotFile', '-o',
-                          help='File to save the plot to. The file extension determines the format, '
-                          'so heatmap.pdf will save the heatmap in PDF format. '
-                          'The available formats are: .png, '
-                          '.eps, .pdf and .svg.',
-                          type=parserCommon.writableFile,
-                          metavar='FILE')
-
-    optional.add_argument('--attributeKey',
-                          help='Instead of deriving labels from the feature column in a GTF file, '
-                          'use the given attribute key, such as gene_biotype. For BED files or '
-                          'entries without the attribute key, None is used as the label.')
-
-    optional.add_argument('--labels', '-l',
-                          metavar='sample1 sample2',
-                          help='User defined labels instead of default labels from '
-                          'file names. '
-                          'Multiple labels have to be separated by spaces, e.g. '
-                          '--labels sample1 sample2 sample3',
-                          nargs='+')
-
-    optional.add_argument('--smartLabels',
-                          action='store_true',
-                          help='Instead of manually specifying labels for the input '
-                          'BAM/BED/GTF files, this causes deepTools to use the file name '
-                          'after removing the path and extension. For BED/GTF files, the '
-                          'eventual region name will be overriden if specified inside '
-                          'the file.')
-
-    optional.add_argument('--regionLabels',
-                          metavar="region1 region2",
-                          help="For BED files, the label given to its region is "
-                          "the file name, but this can be overridden by providing "
-                          "a custom label. For GTF files this is ignored. Note "
-                          "that if you provide labels, you MUST provide one for each "
-                          "BED/GTF file, even though it will be ignored for GTF files.",
-                          nargs='+')
-
-    optional.add_argument('--plotTitle', '-T',
-                          help='Title of the plot, to be printed on top of '
-                          'the generated image. Leave blank for no title. (Default: %(default)s)',
-                          default='')
-
-    optional.add_argument('--plotFileFormat',
-                          metavar='FILETYPE',
-                          help='Image format type. If given, this option '
-                          'overrides the image format based on the plotFile '
-                          'ending. The available options are: png, '
-                          'eps, pdf, plotly and svg.',
-                          choices=['png', 'pdf', 'svg', 'eps', 'plotly'])
-
-    optional.add_argument('--outRawCounts',
-                          help='Save the counts per region to a tab-delimited file.',
-                          type=parserCommon.writableFile,
-                          metavar='FILE')
-
-    optional.add_argument('--perSample',
-                          help='Group the plots by sample, rather than by feature type (the default).',
-                          action='store_true')
-
-    optional.add_argument('--variableScales',
-                          help='By default, the y-axes are always 0-100. This allows the axis range to be restricted.',
-                          action='store_true')
-
-    optional.add_argument('--plotHeight',
-                          help='Plot height in cm. (Default: %(default)s)',
-                          type=float,
-                          default=20)
-
-    optional.add_argument('--plotWidth',
-                          help='Plot width in cm. The minimum value is 1 cm. (Default: %(default)s)',
-                          type=float,
-                          default=20)
-
-    optional.add_argument('--colors',
-                          help='List of colors to use '
-                          'for the plotted lines. Color names '
-                          'and html hex strings (e.g., #eeff22) '
-                          'are accepted. The color names should '
-                          'be space separated. For example, '
-                          '--colors red blue green ',
-                          nargs='+')
-
-    optional.add_argument('--numPlotsPerRow',
-                          help='Number of plots per row (Default: %(default)s)',
-                          type=int,
-                          default=4)
-
-    optional.add_argument('--alpha',
-                          default=0.9,
-                          type=parserCommon.check_float_0_1,
-                          help='The alpha channel (transparency) to use for the bars. '
-                          'The default is 0.9 and values must be between 0 and 1.')
-
-    optional.add_argument('--Offset',
-                          help='Uses this offset inside of each read as the signal. This is useful in '
-                          'cases like RiboSeq or GROseq, where the signal is 12, 15 or 0 bases past the '
-                          'start of the read. This can be paired with the --filterRNAstrand option. '
-                          'Note that negative values indicate offsets from the end of each read. A value '
-                          'of 1 indicates the first base of the alignment (taking alignment orientation '
-                          'into account). Likewise, a value of -1 is the last base of the alignment. An '
-                          'offset of 0 is not permitted. If two values are specified, then they will be '
-                          'used to specify a range of positions. Note that specifying something like '
-                          '--Offset 5 -1 will result in the 5th through last position being used, which '
-                          'is equivalent to trimming 4 bases from the 5-prime end of alignments.',
-                          metavar='INT',
-                          type=int,
-                          nargs='+',
-                          required=False)
-
-    bed12 = parser.add_argument_group('BED12 arguments')
-
-    bed12.add_argument('--keepExons',
-                       help="For BED12 files, use each exon as a region, rather than columns 2/3",
-                       action="store_true")
+    required.add_argument(
+        "--bamfiles",
+        "-b",
+        metavar="file1.bam file2.bam",
+        help="List of indexed bam files separated by spaces.",
+        nargs="+",
+        required=True,
+    )
+
+    required.add_argument(
+        "--BED",
+        help="Limits the enrichment analysis to "
+        "the regions specified in these BED/GTF files. Enrichment "
+        "is calculated as the number of reads overlapping each "
+        "feature type. The feature type is column 3 in a GTF file "
+        'and "peak" for BED files.',
+        metavar="FILE1.bed FILE2.bed",
+        nargs="+",
+        required=True,
+    )
+
+    optional = parser.add_argument_group("Optional arguments")
+
+    optional.add_argument(
+        "--plotFile",
+        "-o",
+        help="File to save the plot to. The file extension determines the format, "
+        "so heatmap.pdf will save the heatmap in PDF format. "
+        "The available formats are: .png, "
+        ".eps, .pdf and .svg.",
+        type=parserCommon.writableFile,
+        metavar="FILE",
+    )
+
+    optional.add_argument(
+        "--attributeKey",
+        help="Instead of deriving labels from the feature column in a GTF file, "
+        "use the given attribute key, such as gene_biotype. For BED files or "
+        "entries without the attribute key, None is used as the label.",
+    )
+
+    optional.add_argument(
+        "--labels",
+        "-l",
+        metavar="sample1 sample2",
+        help="User defined labels instead of default labels from "
+        "file names. "
+        "Multiple labels have to be separated by spaces, e.g. "
+        "--labels sample1 sample2 sample3",
+        nargs="+",
+    )
+
+    optional.add_argument(
+        "--smartLabels",
+        action="store_true",
+        help="Instead of manually specifying labels for the input "
+        "BAM/BED/GTF files, this causes deepTools to use the file name "
+        "after removing the path and extension. For BED/GTF files, the "
+        "eventual region name will be overridden if specified inside "
+        "the file.",
+    )
+
+    optional.add_argument(
+        "--regionLabels",
+        metavar="region1 region2",
+        help="For BED files, the label given to its region is "
+        "the file name, but this can be overridden by providing "
+        "a custom label. For GTF files this is ignored. Note "
+        "that if you provide labels, you MUST provide one for each "
+        "BED/GTF file, even though it will be ignored for GTF files.",
+        nargs="+",
+    )
+
+    optional.add_argument(
+        "--plotTitle",
+        "-T",
+        help="Title of the plot, to be printed on top of "
+        "the generated image. Leave blank for no title. (Default: %(default)s)",
+        default="",
+    )
+
+    optional.add_argument(
+        "--plotFileFormat",
+        metavar="FILETYPE",
+        help="Image format type. If given, this option "
+        "overrides the image format based on the plotFile "
+        "ending. The available options are: png, "
+        "eps, pdf, plotly and svg.",
+        choices=["png", "pdf", "svg", "eps", "plotly"],
+    )
+
+    optional.add_argument(
+        "--outRawCounts",
+        help="Save the counts per region to a tab-delimited file.",
+        type=parserCommon.writableFile,
+        metavar="FILE",
+    )
+
+    optional.add_argument(
+        "--perSample",
+        help="Group the plots by sample, rather than by feature type (the default).",
+        action="store_true",
+    )
+
+    optional.add_argument(
+        "--variableScales",
+        help="By default, the y-axes are always 0-100. This allows the axis range to be restricted.",
+        action="store_true",
+    )
+
+    optional.add_argument(
+        "--plotHeight",
+        help="Plot height in cm. (Default: %(default)s)",
+        type=float,
+        default=20,
+    )
+
+    optional.add_argument(
+        "--plotWidth",
+        help="Plot width in cm. The minimum value is 1 cm. (Default: %(default)s)",
+        type=float,
+        default=20,
+    )
+
+    optional.add_argument(
+        "--colors",
+        help="List of colors to use "
+        "for the plotted lines. Color names "
+        "and html hex strings (e.g., #eeff22) "
+        "are accepted. The color names should "
+        "be space separated. For example, "
+        "--colors red blue green ",
+        nargs="+",
+    )
+
+    optional.add_argument(
+        "--numPlotsPerRow",
+        help="Number of plots per row (Default: %(default)s)",
+        type=int,
+        default=4,
+    )
+
+    optional.add_argument(
+        "--alpha",
+        default=0.9,
+        type=parserCommon.check_float_0_1,
+        help="The alpha channel (transparency) to use for the bars. "
+        "The default is 0.9 and values must be between 0 and 1.",
+    )
+
+    optional.add_argument(
+        "--Offset",
+        help="Uses this offset inside of each read as the signal. This is useful in "
+        "cases like RiboSeq or GROseq, where the signal is 12, 15 or 0 bases past the "
+        "start of the read. This can be paired with the --filterRNAstrand option. "
+        "Note that negative values indicate offsets from the end of each read. A value "
+        "of 1 indicates the first base of the alignment (taking alignment orientation "
+        "into account). Likewise, a value of -1 is the last base of the alignment. An "
+        "offset of 0 is not permitted. If two values are specified, then they will be "
+        "used to specify a range of positions. Note that specifying something like "
+        "--Offset 5 -1 will result in the 5th through last position being used, which "
+        "is equivalent to trimming 4 bases from the 5-prime end of alignments.",
+        metavar="INT",
+        type=int,
+        nargs="+",
+        required=False,
+    )
+
+    bed12 = parser.add_argument_group("BED12 arguments")
+
+    bed12.add_argument(
+        "--keepExons",
+        help="For BED12 files, use each exon as a region, rather than columns 2/3",
+        action="store_true",
+    )

     return parser

@@ -207,7 +251,7 @@ def getBAMBlocks(read, defaultFragmentLength, centerRead, offset=None):
     if defaultFragmentLength != "read length":
         maxPairedFragmentLength = 4 * defaultFragmentLength

-    if defaultFragmentLength == 'read length':
+    if defaultFragmentLength == "read length":
         blocks = read.get_blocks()
     else:
         if cr.is_proper_pair(read, maxPairedFragmentLength):
@@ -232,8 +276,11 @@ def getBAMBlocks(read, defaultFragmentLength, centerRead, offset=None):
             fragmentStart = fragmentCenter - read.infer_query_length(always=False) / 2
             fragmentEnd = fragmentStart + read.infer_query_length(always=False)

-        assert fragmentStart < fragmentEnd, "fragment start greater than fragment" \
-            "end for read {}".format(read.query_name)
+        assert (
+            fragmentStart < fragmentEnd
+        ), "fragment start greater than fragment " "end for read {}".format(
+            read.query_name
+        )
         blocks = [(int(fragmentStart), int(fragmentEnd))]

     # Handle read offsets, if needed
@@ -262,7 +309,7 @@ def getBAMBlocks(read, defaultFragmentLength, centerRead, offset=None):
         if read.is_reverse:
             stretch = stretch[::-1]
         try:
-            foo = stretch[offset[0]:offset[1]]
+            foo = stretch[offset[0] : offset[1]]
         except:
             return rv
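Editor's note: the hunk above only re-spaces the slice stretch[offset[0] : offset[1]]. For readers of the diff, here is a minimal sketch of how a 1-based --Offset pair can map onto such a slice; the exact conversion lives outside this hunk, so offset_slice and its arguments are illustrative only, not deepTools code:

# Editor's sketch (hypothetical helper): how a 1-based --Offset pair such
# as (5, -1) can translate into a Python slice over the per-base positions
# covered by an alignment.
stretch = list(range(101, 151))  # positions of a 50 bp alignment

def offset_slice(positions, lo, hi):
    # 1-based and inclusive; negative values count from the 3' end
    start = lo - 1 if lo > 0 else len(positions) + lo
    stop = hi if hi > 0 else len(positions) + hi + 1
    return positions[start:stop]

assert offset_slice(stretch, 5, -1) == stretch[4:]  # 5th base through the last
assert offset_slice(stretch, 1, 1) == stretch[:1]   # only the 5' base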
@@ -273,7 +320,9 @@ def getBAMBlocks(read, defaultFragmentLength, centerRead, offset=None):
         # Convert the stretch back to a list of tuples
         foo = np.array(foo)
         d = foo[1:] - foo[:-1]
-        idx = np.argwhere(d > 1).flatten().tolist()  # This now holds the interval bounds as a list
+        idx = (
+            np.argwhere(d > 1).flatten().tolist()
+        )  # This now holds the interval bounds as a list
         idx.append(-1)
         last = 0
         blocks = []
@@ -316,7 +365,10 @@ def getEnrichment_worker(arglist):
                 continue
             if args.minMappingQuality and read.mapq < args.minMappingQuality:
                 continue
-            if args.samFlagInclude and read.flag & args.samFlagInclude != args.samFlagInclude:
+            if (
+                args.samFlagInclude
+                and read.flag & args.samFlagInclude != args.samFlagInclude
+            ):
                 continue
             if args.samFlagExclude and read.flag & args.samFlagExclude != 0:
                 continue
@@ -335,8 +387,11 @@ def getEnrichment_worker(arglist):
                     e = s - tLen
                     if read.reference_id != read.next_reference_id:
                         e = read.pnext
-                if lpos is not None and lpos == read.reference_start \
-                        and (s, e, read.next_reference_id, read.is_reverse) in prev_pos:
+                if (
+                    lpos is not None
+                    and lpos == read.reference_start
+                    and (s, e, read.next_reference_id, read.is_reverse) in prev_pos
+                ):
                     continue
                 if lpos != read.reference_start:
                     prev_pos.clear()
@@ -345,7 +400,12 @@ def getEnrichment_worker(arglist):
             total[idx] += 1

             # Get blocks, possibly extending
-            features = gtf.findOverlaps(chrom, getBAMBlocks(read, defaultFragmentLength, args.centerReads, args.Offset))
+            features = gtf.findOverlaps(
+                chrom,
+                getBAMBlocks(
+                    read, defaultFragmentLength, args.centerReads, args.Offset
+                ),
+            )

             if features is not None and len(features) > 0:
                 for x in features:
@@ -367,19 +427,25 @@ def plotEnrichment(args, featureCounts, totalCounts, features):

     # Handle the colors
     if not args.colors:
-        cmap_plot = plt.get_cmap('jet')
-        args.colors = cmap_plot(np.arange(barsPerPlot, dtype=float) / float(barsPerPlot))
-        if args.plotFileFormat == 'plotly':
+        cmap_plot = plt.get_cmap("jet")
+        args.colors = cmap_plot(
+            np.arange(barsPerPlot, dtype=float) / float(barsPerPlot)
+        )
+        if args.plotFileFormat == "plotly":
             args.colors = range(barsPerPlot)
     elif len(args.colors) < barsPerPlot:
-        sys.exit("Error: {0} colors were requested, but {1} were needed!".format(len(args.colors), barsPerPlot))
+        sys.exit(
+            "Error: {0} colors were requested, but {1} were needed!".format(
+                len(args.colors), barsPerPlot
+            )
+        )

     data = []
-    if args.plotFileFormat == 'plotly':
+    if args.plotFileFormat == "plotly":
         fig = go.Figure()
-        fig['layout'].update(title=args.plotTitle)
-        domainWidth = .9 / cols
-        domainHeight = .9 / rows
+        fig["layout"].update(title=args.plotTitle)
+        domainWidth = 0.9 / cols
+        domainHeight = 0.9 / rows
         bufferHeight = 0.0
         if rows > 1:
             bufferHeight = 0.1 / (rows - 1)
@@ -388,7 +454,7 @@ def plotEnrichment(args, featureCounts, totalCounts, features):
             bufferWidth = 0.1 / (cols - 1)
     else:
         grids = gridspec.GridSpec(rows, cols)
-        plt.rcParams['font.size'] = 10.0
+        plt.rcParams["font.size"] = 10.0
     # convert cm values to inches
     fig = plt.figure(figsize=(args.plotWidth / 2.54, args.plotHeight / 2.54))
@@ -402,44 +468,66 @@ def plotEnrichment(args, featureCounts, totalCounts, features):
             xlabels = features
             ylabel = "% alignments in {0}".format(args.labels[i])
             vals = [featureCounts[i][foo] for foo in features]
-            vals = 100 * np.array(vals, dtype='float64') / totalCounts[i]
+            vals = 100 * np.array(vals, dtype="float64") / totalCounts[i]
         else:
             xlabels = args.labels
             ylabel = "% {0}".format(features[i])
             vals = [foo[features[i]] for foo in featureCounts]
-            vals = 100 * np.array(vals, dtype='float64') / np.array(totalCounts, dtype='float64')
-
-        if args.plotFileFormat == 'plotly':
-            xanchor = 'x{}'.format(i + 1)
-            yanchor = 'y{}'.format(i + 1)
+            vals = (
+                100
+                * np.array(vals, dtype="float64")
+                / np.array(totalCounts, dtype="float64")
+            )
+
+        if args.plotFileFormat == "plotly":
+            xanchor = "x{}".format(i + 1)
+            yanchor = "y{}".format(i + 1)
             base = row * (domainHeight + bufferHeight)
             domain = [base, base + domainHeight]
-            fig['layout']['xaxis{}'.format(i + 1)] = {'domain': domain, 'anchor': yanchor}
+            fig["layout"]["xaxis{}".format(i + 1)] = {
+                "domain": domain,
+                "anchor": yanchor,
+            }
             base = col * (domainWidth + bufferWidth)
             domain = [base, base + domainWidth]
-            fig['layout']['yaxis{}'.format(i + 1)] = {'domain': domain, 'anchor': xanchor, 'title': ylabel}
+            fig["layout"]["yaxis{}".format(i + 1)] = {
+                "domain": domain,
+                "anchor": xanchor,
+                "title": ylabel,
+            }
             if args.variableScales is False:
-                fig['layout']['yaxis{}'.format(i + 1)].update(range=[0, 100])
-            trace = go.Bar(x=xlabels,
-                           y=vals,
-                           opacity=args.alpha,
-                           orientation='v',
-                           showlegend=False,
-                           xaxis=xanchor,
-                           yaxis=yanchor,
-                           name=ylabel,
-                           marker={'color': args.colors, 'line': {'color': args.colors}})
+                fig["layout"]["yaxis{}".format(i + 1)].update(range=[0, 100])
+            trace = go.Bar(
+                x=xlabels,
+                y=vals,
+                opacity=args.alpha,
+                orientation="v",
+                showlegend=False,
+                xaxis=xanchor,
+                yaxis=yanchor,
+                name=ylabel,
+                marker={"color": args.colors, "line": {"color": args.colors}},
+            )
             data.append(trace)
         else:
             ax = plt.subplot(grids[row, col])
-            ax.bar(np.arange(vals.shape[0]), vals, width=1.0, bottom=0.0, align='center', color=args.colors, edgecolor=args.colors, alpha=args.alpha)
+            ax.bar(
+                np.arange(vals.shape[0]),
+                vals,
+                width=1.0,
+                bottom=0.0,
+                align="center",
+                color=args.colors,
+                edgecolor=args.colors,
+                alpha=args.alpha,
+            )
             ax.set_ylabel(ylabel)
             ax.set_xticks(np.arange(vals.shape[0]))
-            ax.set_xticklabels(xlabels, rotation='vertical')
+            ax.set_xticklabels(xlabels, rotation="vertical")
             if args.variableScales is False:
                 ax.set_ylim(0.0, 100.0)

-    if args.plotFileFormat == 'plotly':
+    if args.plotFileFormat == "plotly":
         fig.add_traces(data)
         py.plot(fig, filename=args.plotFile, auto_open=False)
     # colors
@@ -462,8 +550,12 @@ def getChunkLength(args, chromSize):
     """

     if args.region:
-        chromSize, region_start, region_end, genomeChunkLength = getUserRegion(chromSize, args.region)
-        rv = np.ceil((region_start - region_end) / float(4 * args.numberOfProcessors)).astype(int)
+        chromSize, region_start, region_end, genomeChunkLength = getUserRegion(
+            chromSize, args.region
+        )
+        rv = np.ceil(
+            (region_start - region_end) / float(4 * args.numberOfProcessors)
+        ).astype(int)
         return max(1, rv)

     bl = None
@@ -488,18 +580,23 @@ def getChunkLength(args, chromSize):


 def main(args=None):
-
     args = parse_arguments().parse_args(args)

     if not args.outRawCounts and not args.plotFile:
-        sys.exit("Error: You need to specify at least one of --plotFile or --outRawCounts!\n")
+        sys.exit(
+            "Error: You need to specify at least one of --plotFile or --outRawCounts!\n"
+        )

     if args.labels is None:
         args.labels = args.bamfiles
     if args.smartLabels:
         args.labels = smartLabels(args.bamfiles)
     if len(args.labels) != len(args.bamfiles):
-        sys.exit("Error: The number of labels ({0}) does not match the number of BAM files ({1})!".format(len(args.labels), len(args.bamfiles)))
+        sys.exit(
+            "Error: The number of labels ({0}) does not match the number of BAM files ({1})!".format(
+                len(args.labels), len(args.bamfiles)
+            )
+        )

     # Ensure that if we're given an attributeKey that it's not empty
     if args.attributeKey and args.attributeKey == "":
@@ -508,7 +605,12 @@ def main(args=None):
     global gtf
     if not args.regionLabels and args.smartLabels:
         args.regionLabels = smartLabels(args.BED)
-    gtf = Enrichment(args.BED, keepExons=args.keepExons, labels=args.regionLabels, attributeKey=args.attributeKey)
+    gtf = Enrichment(
+        args.BED,
+        keepExons=args.keepExons,
+        labels=args.regionLabels,
+        attributeKey=args.attributeKey,
+    )

     # Get fragment size and chromosome dict
     fhs = [openBam(x) for x in args.bamfiles]
@@ -516,44 +618,58 @@ def main(args=None):
     for fh in fhs:
         fh.close()

-    frag_len_dict, read_len_dict = get_read_and_fragment_length(args.bamfiles[0],
-                                                                return_lengths=False,
-                                                                blackListFileName=args.blackListFileName,
-                                                                numberOfProcessors=args.numberOfProcessors,
-                                                                verbose=args.verbose)
+    frag_len_dict, read_len_dict = get_read_and_fragment_length(
+        args.bamfiles[0],
+        return_lengths=False,
+        blackListFileName=args.blackListFileName,
+        numberOfProcessors=args.numberOfProcessors,
+        verbose=args.verbose,
+    )
     if args.extendReads:
         if args.extendReads is True:
             # try to guess fragment length if the bam file contains paired end reads
             if frag_len_dict:
-                defaultFragmentLength = frag_len_dict['median']
+                defaultFragmentLength = frag_len_dict["median"]
             else:
-                sys.exit("*ERROR*: library is not paired-end. Please provide an extension length.")
+                sys.exit(
+                    "*ERROR*: library is not paired-end. Please provide an extension length."
+                )
             if args.verbose:
-                print("Fragment length based on paired en data "
-                      "estimated to be {0}".format(frag_len_dict['median']))
-        elif args.extendReads < read_len_dict['median']:
-            sys.stderr.write("*WARNING*: read extension is smaller than read length (read length = {}). "
-                             "Reads will not be extended.\n".format(int(read_len_dict['median'])))
-            defaultFragmentLength = 'read length'
+                print(
+                    "Fragment length based on paired-end data "
+                    "estimated to be {0}".format(frag_len_dict["median"])
+                )
+        elif args.extendReads < read_len_dict["median"]:
+            sys.stderr.write(
+                "*WARNING*: read extension is smaller than read length (read length = {}). "
+                "Reads will not be extended.\n".format(int(read_len_dict["median"]))
+            )
+            defaultFragmentLength = "read length"
         elif args.extendReads > 2000:
-            sys.exit("*ERROR*: read extension must be smaller that 2000. Value give: {} ".format(args.extendReads))
+            sys.exit(
+                "*ERROR*: read extension must be smaller than 2000. Value given: {} ".format(
+                    args.extendReads
+                )
+            )
         else:
             defaultFragmentLength = args.extendReads
     else:
-        defaultFragmentLength = 'read length'
+        defaultFragmentLength = "read length"

     # Get the chunkLength
     chunkLength = getChunkLength(args, chromSize)

     # Map reduce to get the counts/file/feature
-    res = mapReduce([args, defaultFragmentLength],
-                    getEnrichment_worker,
-                    chromSize,
-                    genomeChunkLength=chunkLength,
-                    region=args.region,
-                    blackListFileName=args.blackListFileName,
-                    numberOfProcessors=args.numberOfProcessors,
-                    verbose=args.verbose)
+    res = mapReduce(
+        [args, defaultFragmentLength],
+        getEnrichment_worker,
+        chromSize,
+        genomeChunkLength=chunkLength,
+        region=args.region,
+        blackListFileName=args.blackListFileName,
+        numberOfProcessors=args.numberOfProcessors,
+        verbose=args.verbose,
+    )

     features = res[0][1]
     featureCounts = []
@@ -582,5 +698,9 @@ def main(args=None):
         of.write("file\tfeatureType\tpercent\tfeatureReadCount\ttotalReadCount\n")
         for i, x in enumerate(args.labels):
             for k, v in featureCounts[i].items():
-                of.write("{0}\t{1}\t{2:5.2f}\t{3}\t{4}\n".format(x, k, (100.0 * v) / totalCounts[i], v, totalCounts[i]))
+                of.write(
+                    "{0}\t{1}\t{2:5.2f}\t{3}\t{4}\n".format(
+                        x, k, (100.0 * v) / totalCounts[i], v, totalCounts[i]
+                    )
+                )
     of.close()
diff --git a/deeptools/plotFingerprint.py b/deeptools/plotFingerprint.py
index 4aee5b470..fc0db7f87 100755
--- a/deeptools/plotFingerprint.py
+++ b/deeptools/plotFingerprint.py
@@ -5,9 +5,10 @@
 import argparse
 import sys
 import matplotlib
-matplotlib.use('Agg')
-matplotlib.rcParams['pdf.fonttype'] = 42
-matplotlib.rcParams['svg.fonttype'] = 'none'
+
+matplotlib.use("Agg")
+matplotlib.rcParams["pdf.fonttype"] = 42
+matplotlib.rcParams["svg.fonttype"] = "none"
 from deeptools import cm  # noqa: F401
 import matplotlib.pyplot as plt
 from scipy import interpolate
@@ -21,7 +22,7 @@
 from deeptools import parserCommon
 from deeptools.utilities import smartLabels

-old_settings = np.seterr(all='ignore')
+old_settings = np.seterr(all="ignore")
 MAXLEN = 10000000


@@ -32,25 +33,30 @@ def parse_arguments(args=None):
     optional_args = get_optional_args()
     read_options_parser = parserCommon.read_options()
     parser = argparse.ArgumentParser(
-        parents=[required_args, output_args, read_options_parser,
-                 optional_args, parent_parser],
+        parents=[
+            required_args,
+            output_args,
+            read_options_parser,
+            optional_args,
+            parent_parser,
+        ],
         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description='This tool samples indexed BAM files '
-        'and plots a profile of cumulative read coverages for each. '
-        'All reads overlapping a window (bin) of the '
-        'specified length are counted; '
-        'these counts are sorted '
-        'and the cumulative sum is finally plotted. ',
-        conflict_handler='resolve',
-        usage='An example usage is: plotFingerprint -b treatment.bam control.bam '
-        '-plot fingerprint.png',
-        add_help=False)
+        description="This tool samples indexed BAM files "
+        "and plots a profile of cumulative read coverages for each. "
+        "All reads overlapping a window (bin) of the "
+        "specified length are counted; "
+        "these counts are sorted "
+        "and the cumulative sum is finally plotted. ",
+        conflict_handler="resolve",
+        usage="An example usage is: plotFingerprint -b treatment.bam control.bam "
+        "-plot fingerprint.png",
+        add_help=False,
+    )

     return parser


 def process_args(args=None):
-
     args = parse_arguments().parse_args(args)

     if args.JSDsample is not None and args.JSDsample not in args.bamfiles:
@@ -72,109 +78,140 @@ def process_args(args=None):

 def get_required_args():
     parser = argparse.ArgumentParser(add_help=False)

-    required = parser.add_argument_group('Required arguments')
+    required = parser.add_argument_group("Required arguments")

     # define the arguments
-    required.add_argument('--bamfiles', '-b',
-                          metavar='bam files',
-                          nargs='+',
-                          help='List of indexed BAM files',
-                          required=True)
+    required.add_argument(
+        "--bamfiles",
+        "-b",
+        metavar="bam files",
+        nargs="+",
+        help="List of indexed BAM files",
+        required=True,
+    )

     return parser


 def get_optional_args():
-    parser = argparse.ArgumentParser(add_help=False,
-                                     conflict_handler='resolve')
-    optional = parser.add_argument_group('Optional arguments')
-    optional.add_argument("--help", "-h", action="help",
-                          help="show this help message and exit")
-
-    optional.add_argument('--labels', '-l',
-                          metavar='',
-                          help='List of labels to use in the output. '
-                          'If not given, the file names will be used instead. '
-                          'Separate the labels by spaces.',
-                          nargs='+')
-
-    optional.add_argument('--smartLabels',
-                          action='store_true',
-                          help='Instead of manually specifying labels for the input '
-                          'BAM/bigWig files, this causes deepTools to use the file name '
-                          'after removing the path and extension.')
-
-    optional.add_argument('--binSize', '-bs',
-                          help='Window size in base pairs to '
-                          'sample the genome. This times --numberOfSamples should be less than the genome size. (Default: %(default)s)',
-                          default=500,
-                          type=int)
-
-    optional.add_argument('--numberOfSamples', '-n',
-                          help='The number of bins that are sampled from the genome, '
-                          'for which the overlapping number of reads is computed. (Default: %(default)s)',
-                          default=5e5,
-                          type=int)
-
-    optional.add_argument('--plotFileFormat',
-                          metavar='',
-                          help='image format type. If given, this option '
-                          'overrides the image format based on the ending '
-                          'given via --plotFile '
-                          'ending. The available options are: "png", '
-                          '"eps", "pdf", "plotly" and "svg"',
-                          choices=['png', 'pdf', 'svg', 'eps', 'plotly'])
-
-    optional.add_argument('--plotTitle', '-T',
-                          help='Title of the plot, to be printed on top of '
-                          'the generated image. Leave blank for no title. (Default: %(default)s)',
-                          default='')
-
-    optional.add_argument('--skipZeros',
-                          help='If set, then regions with zero overlapping reads'
-                          'for *all* given BAM files are ignored. This '
-                          'will result in a reduced number of read '
-                          'counts than that specified in --numberOfSamples',
-                          action='store_true')
-
-    optional.add_argument('--outQualityMetrics',
-                          help='Quality metrics can optionally be output to '
-                          'this file. The file will have one row per input BAM '
-                          'file and columns containing a number of metrics. '
-                          'Please see the online documentation for a longer '
-                          'explanation: http://deeptools.readthedocs.io/en/latest/content/feature/plotFingerprint_QC_metrics.html .',
-                          type=parserCommon.writableFile,
-                          metavar='FILE.txt')
-
-    optional.add_argument('--JSDsample',
-                          help='Reference sample against which to compute the '
-                          'Jensen-Shannon distance and the CHANCE statistics. '
-                          'If this is not specified, '
-                          'then these will not be calculated. If '
-                          '--outQualityMetrics is not specified then this will '
-                          'be ignored. The Jensen-Shannon implementation is '
-                          'based on code from Sitanshu Gakkhar at BCGSC. The '
-                          'CHANCE implementation is based on code from Matthias '
-                          'Haimel.',
-                          metavar='sample.bam')
+    parser = argparse.ArgumentParser(add_help=False, conflict_handler="resolve")
+    optional = parser.add_argument_group("Optional arguments")
+    optional.add_argument(
+        "--help", "-h", action="help", help="show this help message and exit"
+    )
+
+    optional.add_argument(
+        "--labels",
+        "-l",
+        metavar="",
+        help="List of labels to use in the output. "
+        "If not given, the file names will be used instead. "
+        "Separate the labels by spaces.",
+        nargs="+",
+    )
+
+    optional.add_argument(
+        "--smartLabels",
+        action="store_true",
+        help="Instead of manually specifying labels for the input "
+        "BAM/bigWig files, this causes deepTools to use the file name "
+        "after removing the path and extension.",
+    )
+
+    optional.add_argument(
+        "--binSize",
+        "-bs",
+        help="Window size in base pairs to "
+        "sample the genome. This times --numberOfSamples should be less than the genome size. (Default: %(default)s)",
+        default=500,
+        type=int,
+    )
+
+    optional.add_argument(
+        "--numberOfSamples",
+        "-n",
+        help="The number of bins that are sampled from the genome, "
+        "for which the overlapping number of reads is computed. (Default: %(default)s)",
+        default=5e5,
+        type=int,
+    )
+
+    optional.add_argument(
+        "--plotFileFormat",
+        metavar="",
+        help="image format type. If given, this option "
+        "overrides the image format based on the ending "
+        "given via --plotFile "
+        'ending. The available options are: "png", '
+        '"eps", "pdf", "plotly" and "svg"',
+        choices=["png", "pdf", "svg", "eps", "plotly"],
+    )
+
+    optional.add_argument(
+        "--plotTitle",
+        "-T",
+        help="Title of the plot, to be printed on top of "
+        "the generated image. Leave blank for no title. (Default: %(default)s)",
+        default="",
+    )
+
+    optional.add_argument(
+        "--skipZeros",
+        help="If set, then regions with zero overlapping reads "
+        "for *all* given BAM files are ignored. This "
+        "will result in fewer read "
+        "counts than the number specified in --numberOfSamples",
+        action="store_true",
+    )
+
+    optional.add_argument(
+        "--outQualityMetrics",
+        help="Quality metrics can optionally be output to "
+        "this file. The file will have one row per input BAM "
+        "file and columns containing a number of metrics. "
+        "Please see the online documentation for a longer "
+        "explanation: http://deeptools.readthedocs.io/en/latest/content/feature/plotFingerprint_QC_metrics.html .",
+        type=parserCommon.writableFile,
+        metavar="FILE.txt",
+    )
+
+    optional.add_argument(
+        "--JSDsample",
+        help="Reference sample against which to compute the "
+        "Jensen-Shannon distance and the CHANCE statistics. "
+        "If this is not specified, "
+        "then these will not be calculated. If "
+        "--outQualityMetrics is not specified then this will "
+        "be ignored. The Jensen-Shannon implementation is "
+        "based on code from Sitanshu Gakkhar at BCGSC. The "
+        "CHANCE implementation is based on code from Matthias "
+        "Haimel.",
+        metavar="sample.bam",
+    )

     return parser


 def get_output_args():
     parser = argparse.ArgumentParser(add_help=False)
-    group = parser.add_argument_group('Output')
-    group.add_argument('--plotFile', '-plot', '-o',
-                       help='File name of the output figure. The file '
-                       'ending will be used to determine the image '
-                       'format. The available options are typically: "png", '
-                       '"eps", "pdf" and "svg", e.g. : fingerprint.png.',
-                       type=parserCommon.writableFile,
-                       metavar='')
-
-    group.add_argument('--outRawCounts',
-                       help='Output file name to save the read counts per bin.',
-                       type=parserCommon.writableFile,
-                       metavar='')
+    group = parser.add_argument_group("Output")
+    group.add_argument(
+        "--plotFile",
+        "-plot",
+        "-o",
+        help="File name of the output figure. The file "
+        "ending will be used to determine the image "
+        'format. The available options are typically: "png", '
+        '"eps", "pdf" and "svg", e.g. : fingerprint.png.',
+        type=parserCommon.writableFile,
+        metavar="",
+    )
+
+    group.add_argument(
+        "--outRawCounts",
+        help="Output file name to save the read counts per bin.",
+        type=parserCommon.writableFile,
+        metavar="",
+    )

     return parser

@@ -250,7 +287,11 @@ def getSyntheticJSD(vec):
             chip[int(val)] += 1
     input = coverage * poisson.pmf(np.arange(1, MAXLEN), lamb)
     if chip[-1] > 0:
-        print("{} bins had coverage over the maximum value of {} during synthetic JSD computation".format(chip[-1], MAXLEN))
+        print(
+            "{} bins had coverage over the maximum value of {} during synthetic JSD computation".format(
+                chip[-1], MAXLEN
+            )
+        )

     return getJSDcommon(chip, input)

@@ -296,9 +337,17 @@ def getJSD(args, idx, mat):
         if val > 0:
             input[int(val)] += 1
     if input[-1] > 0:
-        print("{} bins had coverage over the maximum value of {} in the input sample".format(input[-1], MAXLEN))
+        print(
+            "{} bins had coverage over the maximum value of {} in the input sample".format(
+                input[-1], MAXLEN
+            )
+        )
     if chip[-1] > 0:
-        print("{} bins had coverage over the maximum value of {} in the ChIP sample".format(chip[-1], MAXLEN))
+        print(
+            "{} bins had coverage over the maximum value of {} in the ChIP sample".format(
+                chip[-1], MAXLEN
+            )
+        )

     return getJSDcommon(chip, input)

@@ -307,14 +356,23 @@ def getJSDcommon(chip, input):
     """
     This is a continuation of getJSD to allow getSyntheticJSD to reuse code
     """
+
     def signalAndBinDist(x):
         x = np.array(x)
         (n,) = x.shape
         signalValues = np.array(list(range(n)))
         totalSignal = x * signalValues
-        normalizedTotalSignal = np.cumsum(totalSignal) / np.sum(totalSignal).astype("float")
+        normalizedTotalSignal = np.cumsum(totalSignal) / np.sum(totalSignal).astype(
+            "float"
+        )
         binDist = np.cumsum(x).astype("float") / sum(x)
-        interpolater = interpolate.interp1d(binDist, normalizedTotalSignal, kind='linear', bounds_error=False, fill_value=(0, 1))
+        interpolater = interpolate.interp1d(
+            binDist,
+            normalizedTotalSignal,
+            kind="linear",
+            bounds_error=False,
+            fill_value=(0, 1),
+        )
         return (binDist, normalizedTotalSignal, interpolater)

     # Interpolate the signals to evenly spaced bins, which also removes 0-coverage bins
@@ -337,12 +395,16 @@ def signalAndBinDist(x):
     PMFchip = np.ediff1d(chipSignalInterp)

     if abs(sum(PMFinput) - 1) > 0.01 or abs(sum(PMFchip) - 1) > 0.01:
-        sys.stderr.write("Warning: At least one PMF integral is significantly different from 1! The JSD will not be returned")
+        sys.stderr.write(
+            "Warning: At least one PMF integral is significantly different from 1! The JSD will not be returned"
+        )
         return np.NAN

     # Compute the JSD from the PMFs
     M = (PMFinput + PMFchip) / 2.0
-    JSD = 0.5 * (np.nansum(PMFinput * np.log2(PMFinput / M))) + 0.5 * (np.nansum(PMFchip * np.log2(PMFchip / M)))
+    JSD = 0.5 * (np.nansum(PMFinput * np.log2(PMFinput / M))) + 0.5 * (
+        np.nansum(PMFchip * np.log2(PMFchip / M))
+    )
     return np.sqrt(JSD)

@@ -352,7 +414,9 @@ def getExpected(mu):
     Given a mean coverage mu, determine the AUC, X-intercept, and elbow point
     of a Poisson-distributed perfectly behaved input sample with the same coverage
     """
-    x = np.arange(round(poisson.interval(0.99999, mu=mu)[1] + 1))  # This will be an appropriate range
+    x = np.arange(
+        round(poisson.interval(0.99999, mu=mu)[1] + 1)
+    )  # This will be an appropriate range
     pmf = poisson.pmf(x, mu=mu)
     cdf = poisson.cdf(x, mu=mu)
     cs = np.cumsum(pmf * x)
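Editor's note: to make the re-wrapped getExpected easier to verify, here is a standalone recomputation of its visible steps for a toy mean coverage; the function's return statement falls outside this hunk, so how the values are packaged is an assumption:

# Editor's sketch mirroring the lines of getExpected shown above (mu is a
# toy mean coverage; the packaging of AUC/X-intercept/elbow is assumed).
import numpy as np
from scipy.stats import poisson

mu = 2.0
x = np.arange(round(poisson.interval(0.99999, mu=mu)[1] + 1))
pmf = poisson.pmf(x, mu=mu)
cdf = poisson.cdf(x, mu=mu)
cs = np.cumsum(pmf * x)  # cumulative signal, used for the expected AUC
print(int(x[-1]), cs[-1] / mu)  # range upper bound; captured signal fraction (~1)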
@@ -367,7 +431,9 @@ def main(args=None):
     args = process_args(args)

     if not args.plotFile and not args.outRawCounts and not args.outQualityMetrics:
-        sys.stderr.write("\nAt least one of --plotFile, --outRawCounts or --outQualityMetrics is required.\n")
+        sys.stderr.write(
+            "\nAt least one of --plotFile, --outRawCounts or --outQualityMetrics is required.\n"
+        )
         sys.exit(1)

     cr = sumR.SumCoveragePerBin(
@@ -385,58 +451,84 @@ def main(args=None):
         samFlag_include=args.samFlagInclude,
         samFlag_exclude=args.samFlagExclude,
         minFragmentLength=args.minFragmentLength,
-        maxFragmentLength=args.maxFragmentLength)
+        maxFragmentLength=args.maxFragmentLength,
+    )
     num_reads_per_bin = cr.run()

     if num_reads_per_bin.sum() == 0:
         import sys
+
         sys.stderr.write(
             "\nNo reads were found in {} regions sampled. Check that the\n"
             "min mapping quality is not overly high and that the \n"
             "chromosome names between bam files are consistent.\n"
             "For small genomes, decrease the --numberOfSamples.\n"
-            "\n".format(num_reads_per_bin.shape[0]))
+            "\n".format(num_reads_per_bin.shape[0])
+        )
         exit(1)

     if args.skipZeros:
         num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)

     total = len(num_reads_per_bin[:, 0])
-    x = np.arange(total).astype('float') / total  # normalize from 0 to 1
+    x = np.arange(total).astype("float") / total  # normalize from 0 to 1

     if args.plotFile is not None:
         i = 0
         # matplotlib won't iterate through line styles by itself
         pyplot_line_styles = sum([7 * ["-"], 7 * ["--"], 7 * ["-."], 7 * [":"]], [])
-        plotly_colors = ["#d73027", "#fc8d59", "#f33090", "#e0f3f8", "#91bfdb", "#4575b4"]
-        plotly_line_styles = sum([6 * ["solid"], 6 * ["dot"], 6 * ["dash"], 6 * ["longdash"], 6 * ["dashdot"], 6 * ["longdashdot"]], [])
+        plotly_colors = [
+            "#d73027",
+            "#fc8d59",
+            "#f33090",
+            "#e0f3f8",
+            "#91bfdb",
+            "#4575b4",
+        ]
+        plotly_line_styles = sum(
+            [
+                6 * ["solid"],
+                6 * ["dot"],
+                6 * ["dash"],
+                6 * ["longdash"],
+                6 * ["dashdot"],
+                6 * ["longdashdot"],
+            ],
+            [],
+        )
         data = []
         for i, reads in enumerate(num_reads_per_bin.T):
             count = np.cumsum(np.sort(reads))
             count = count / count[-1]  # to normalize y from 0 to 1
-            if args.plotFileFormat == 'plotly':
-                trace = go.Scatter(x=x, y=count, mode='lines', name=args.labels[i])
-                trace['line'].update(dash=plotly_line_styles[i % 36], color=plotly_colors[i % 6])
+            if args.plotFileFormat == "plotly":
+                trace = go.Scatter(x=x, y=count, mode="lines", name=args.labels[i])
+                trace["line"].update(
+                    dash=plotly_line_styles[i % 36], color=plotly_colors[i % 6]
+                )
                 data.append(trace)
             else:
                 j = i % len(pyplot_line_styles)
-                plt.plot(x, count, label=args.labels[i], linestyle=pyplot_line_styles[j])
-        plt.xlabel('rank')
-        plt.ylabel('fraction w.r.t. bin with highest coverage')
+                plt.plot(
+                    x, count, label=args.labels[i], linestyle=pyplot_line_styles[j]
+                )
+        plt.xlabel("rank")
+        plt.ylabel("fraction w.r.t. bin with highest coverage")

         # set the plotFileFormat explicitly to None to trigger the
         # format from the file-extension
         if not args.plotFileFormat:
             args.plotFileFormat = None

-        if args.plotFileFormat == 'plotly':
+        if args.plotFileFormat == "plotly":
             fig = go.Figure()
             fig.add_traces(data)
-            fig['layout'].update(title=args.plotTitle)
-            fig['layout']['xaxis1'].update(title="rank")
-            fig['layout']['yaxis1'].update(title="fraction w.r.t bin with highest coverage")
+            fig["layout"].update(title=args.plotTitle)
+            fig["layout"]["xaxis1"].update(title="rank")
+            fig["layout"]["yaxis1"].update(
+                title="fraction w.r.t bin with highest coverage"
+            )
             py.plot(fig, filename=args.plotFile, auto_open=False)
         else:
-            plt.legend(loc='upper left')
+            plt.legend(loc="upper left")
             plt.suptitle(args.plotTitle)
             plt.savefig(args.plotFile, bbox_inches=0, format=args.plotFileFormat)
             plt.close()
@@ -445,33 +537,55 @@ def main(args=None):
         of = open(args.outRawCounts, "w")
         of.write("#plotFingerprint --outRawCounts\n")
         of.write("'" + "'\t'".join(args.labels) + "'\n")
-        fmt = "\t".join(np.repeat('%d', num_reads_per_bin.shape[1])) + "\n"
+        fmt = "\t".join(np.repeat("%d", num_reads_per_bin.shape[1])) + "\n"
         for row in num_reads_per_bin:
             of.write(fmt % tuple(row))
         of.close()

     if args.outQualityMetrics is not None:
         of = open(args.outQualityMetrics, "w")
-        of.write("Sample\tAUC\tSynthetic AUC\tX-intercept\tSynthetic X-intercept\tElbow Point\tSynthetic Elbow Point")
+        of.write(
+            "Sample\tAUC\tSynthetic AUC\tX-intercept\tSynthetic X-intercept\tElbow Point\tSynthetic Elbow Point"
+        )
         if args.JSDsample:
-            of.write("\tJS Distance\tSynthetic JS Distance\t% genome enriched\tdiff. enrichment\tCHANCE divergence")
+            of.write(
+                "\tJS Distance\tSynthetic JS Distance\t% genome enriched\tdiff. enrichment\tCHANCE divergence"
+            )
         else:
             of.write("\tSynthetic JS Distance")
         of.write("\n")
-        line = np.arange(num_reads_per_bin.shape[0]) / float(num_reads_per_bin.shape[0] - 1)
+        line = np.arange(num_reads_per_bin.shape[0]) / float(
+            num_reads_per_bin.shape[0] - 1
+        )
         for idx, reads in enumerate(num_reads_per_bin.T):
             counts = np.cumsum(np.sort(reads))
             counts = counts / float(counts[-1])
             AUC = np.sum(counts) / float(len(counts))
             XInt = (np.argmax(counts > 0) + 1) / float(counts.shape[0])
             elbow = (np.argmax(line - counts) + 1) / float(counts.shape[0])
-            expected = getExpected(np.mean(reads))  # A tuple of expected (AUC, XInt, elbow)
-            of.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}".format(args.labels[idx], AUC, expected[0], XInt, expected[1], elbow, expected[2]))
+            expected = getExpected(
+                np.mean(reads)
+            )  # A tuple of expected (AUC, XInt, elbow)
+            of.write(
+                "{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}".format(
+                    args.labels[idx],
+                    AUC,
+                    expected[0],
+                    XInt,
+                    expected[1],
+                    elbow,
+                    expected[2],
+                )
+            )
             if args.JSDsample:
                 JSD = getJSD(args, idx, num_reads_per_bin)
                 syntheticJSD = getSyntheticJSD(num_reads_per_bin[:, idx])
                 CHANCE = getCHANCE(args, idx, num_reads_per_bin)
-                of.write("\t{0}\t{1}\t{2}\t{3}\t{4}".format(JSD, syntheticJSD, CHANCE[0], CHANCE[1], CHANCE[2]))
+                of.write(
+                    "\t{0}\t{1}\t{2}\t{3}\t{4}".format(
+                        JSD, syntheticJSD, CHANCE[0], CHANCE[1], CHANCE[2]
+                    )
+                )
             else:
                 syntheticJSD = getSyntheticJSD(num_reads_per_bin[:, idx])
                 of.write("\t{0}".format(syntheticJSD))
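Editor's note: the Jensen-Shannon block above is only re-wrapped; the math is unchanged. For reference, the final formula applied to two toy PMFs (editor's sketch):

# Editor's sketch of the JSD formula reformatted in getJSDcommon, applied
# to two toy probability mass functions that each sum to 1.
import numpy as np

PMFinput = np.array([0.7, 0.2, 0.1])
PMFchip = np.array([0.4, 0.4, 0.2])
M = (PMFinput + PMFchip) / 2.0
JSD = 0.5 * np.nansum(PMFinput * np.log2(PMFinput / M)) + 0.5 * np.nansum(
    PMFchip * np.log2(PMFchip / M)
)
print(np.sqrt(JSD))  # plotFingerprint reports the square root, here ~0.26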
" + "The program requires a matrix file " + "generated by the tool ``computeMatrix``.", + epilog="An example usage is: plotHeatmap -m ", + add_help=False, + ) return parser @@ -50,17 +54,27 @@ def parse_arguments(args=None): def process_args(args=None): args = parse_arguments().parse_args(args) - args.heatmapHeight = args.heatmapHeight if args.heatmapHeight > 3 and args.heatmapHeight <= 100 else 10 + args.heatmapHeight = ( + args.heatmapHeight + if args.heatmapHeight > 3 and args.heatmapHeight <= 100 + else 10 + ) if not matplotlib.colors.is_color_like(args.missingDataColor): - exit("The value {0} for --missingDataColor is not valid".format(args.missingDataColor)) + exit( + "The value {0} for --missingDataColor is not valid".format( + args.missingDataColor + ) + ) - args.boxAroundHeatmaps = True if args.boxAroundHeatmaps == 'yes' else False + args.boxAroundHeatmaps = True if args.boxAroundHeatmaps == "yes" else False return args -def prepare_layout(hm_matrix, heatmapsize, showSummaryPlot, showColorbar, perGroup, colorbar_position): +def prepare_layout( + hm_matrix, heatmapsize, showSummaryPlot, showColorbar, perGroup, colorbar_position +): """ prepare the plot layout as a grid having as many rows @@ -78,7 +92,9 @@ def prepare_layout(hm_matrix, heatmapsize, showSummaryPlot, showColorbar, perGro # on the number of regions contained in the if perGroup: # heatmap - height_ratio = np.array([np.amax(np.diff(hm_matrix.group_boundaries))] * numrows) + height_ratio = np.array( + [np.amax(np.diff(hm_matrix.group_boundaries))] * numrows + ) # scale ratio to sum = heatmapheight height_ratio = heatmapheight * (height_ratio.astype(float) / height_ratio.sum()) else: @@ -93,7 +109,7 @@ def prepare_layout(hm_matrix, heatmapsize, showSummaryPlot, showColorbar, perGro width_ratio = [heatmapwidth] * numcols if showColorbar: - if colorbar_position == 'below': + if colorbar_position == "below": numrows += 2 # a spacer needs to be added to avoid overlaps height_ratio += [4 / 2.54] # spacer height_ratio += [1 / 2.54] @@ -111,12 +127,15 @@ def prepare_layout(hm_matrix, heatmapsize, showSummaryPlot, showColorbar, perGro # numbers to heatmapheigt fractions height_ratio = np.concatenate([[sumplot_height, spacer_height], height_ratio]) - grids = gridspec.GridSpec(numrows, numcols, height_ratios=height_ratio, width_ratios=width_ratio) + grids = gridspec.GridSpec( + numrows, numcols, height_ratios=height_ratio, width_ratios=width_ratio + ) return grids -def autobreaklinetitle(title,sep="[-_,.]",lmax=15): - sss = [ rr for rr in re.split(sep,title) if len(rr) ] + +def autobreaklinetitle(title, sep="[-_,.]", lmax=15): + sss = [rr for rr in re.split(sep, title) if len(rr)] newtitle, tmp = "", "" for ss in sss: tmp += ss @@ -128,15 +147,38 @@ def autobreaklinetitle(title,sep="[-_,.]",lmax=15): newtitle += tmp.strip("-") + "\n" return newtitle -def addProfilePlot(hm, plt, fig, grids, iterNum, iterNum2, perGroup, averageType, plot_type, yAxisLabel, color_list, yMin, yMax, wspace, hspace, colorbar_position, label_rotation=0.0): + +def addProfilePlot( + hm, + plt, + fig, + grids, + iterNum, + iterNum2, + perGroup, + averageType, + plot_type, + yAxisLabel, + color_list, + yMin, + yMax, + wspace, + hspace, + colorbar_position, + label_rotation=0.0, +): """ A function to add profile plots to the given figure, possibly in a custom grid subplot which mimics a tight layout (if wspace and hspace are not None) """ if wspace is not None and hspace is not None: - if colorbar_position == 'side': - gridsSub = 
gridspec.GridSpecFromSubplotSpec(1, iterNum, subplot_spec=grids[0, :-1], wspace=wspace, hspace=hspace) + if colorbar_position == "side": + gridsSub = gridspec.GridSpecFromSubplotSpec( + 1, iterNum, subplot_spec=grids[0, :-1], wspace=wspace, hspace=hspace + ) else: - gridsSub = gridspec.GridSpecFromSubplotSpec(1, iterNum, subplot_spec=grids[0, :], wspace=wspace, hspace=hspace) + gridsSub = gridspec.GridSpecFromSubplotSpec( + 1, iterNum, subplot_spec=grids[0, :], wspace=wspace, hspace=hspace + ) ax_list = [] globalYmin = np.inf @@ -160,25 +202,28 @@ def addProfilePlot(hm, plt, fig, grids, iterNum, iterNum2, perGroup, averageType for group in range(iterNum2): if perGroup: sub_matrix = hm.matrix.get_matrix(sample_id, group) - line_label = sub_matrix['sample'] + line_label = sub_matrix["sample"] else: sub_matrix = hm.matrix.get_matrix(group, sample_id) - line_label = sub_matrix['group'] - plot_single(ax_profile, sub_matrix['matrix'], - averageType, - color_list[group], - line_label, - plot_type=plot_type) + line_label = sub_matrix["group"] + plot_single( + ax_profile, + sub_matrix["matrix"], + averageType, + color_list[group], + line_label, + plot_type=plot_type, + ) if sample_id > 0 and len(yMin) == 1 and len(yMax) == 1: plt.setp(ax_profile.get_yticklabels(), visible=False) - ax_profile.get_yaxis().set_tick_params(direction='in',pad=-22) # beisi - if sample_id == 0 and yAxisLabel != '': + ax_profile.get_yaxis().set_tick_params(direction="in", pad=-22) # beisi + if sample_id == 0 and yAxisLabel != "": ax_profile.set_ylabel(yAxisLabel) xticks, xtickslabel = hm.getTicks(tickIdx) - if np.ceil(max(xticks)) != float(sub_matrix['matrix'].shape[1] - 1): - tickscale = float(sub_matrix['matrix'].shape[1] - 1) / max(xticks) + if np.ceil(max(xticks)) != float(sub_matrix["matrix"].shape[1] - 1): + tickscale = float(sub_matrix["matrix"].shape[1] - 1) / max(xticks) xticks_use = [x * tickscale for x in xticks] ax_profile.axes.set_xticks(xticks_use) else: @@ -190,8 +235,8 @@ def addProfilePlot(hm, plt, fig, grids, iterNum, iterNum2, perGroup, averageType # such that they don't fall off # the heatmap sides ticks = ax_profile.xaxis.get_major_ticks() - ticks[0].label1.set_horizontalalignment('left') - ticks[-1].label1.set_horizontalalignment('right') + ticks[0].label1.set_horizontalalignment("left") + ticks[-1].label1.set_horizontalalignment("right") globalYmin = min(np.float64(globalYmin), ax_profile.get_ylim()[0]) globalYmax = max(globalYmax, ax_profile.get_ylim()[1]) @@ -214,20 +259,30 @@ def addProfilePlot(hm, plt, fig, grids, iterNum, iterNum2, perGroup, averageType return ax_list -def plotlyMatrix(hm, - outFilename, - yMin=[None], yMax=[None], - zMin=[None], zMax=[None], - showSummaryPlot=False, - cmap=None, colorList=None, colorBarPosition='side', - perGroup=False, - averageType='median', yAxisLabel='', xAxisLabel='', - plotTitle='', - showColorbar=False, - label_rotation=0.0): +def plotlyMatrix( + hm, + outFilename, + yMin=[None], + yMax=[None], + zMin=[None], + zMax=[None], + showSummaryPlot=False, + cmap=None, + colorList=None, + colorBarPosition="side", + perGroup=False, + averageType="median", + yAxisLabel="", + xAxisLabel="", + plotTitle="", + showColorbar=False, + label_rotation=0.0, +): label_rotation *= -1.0 - if colorBarPosition != 'side': - sys.error.write("Warning: It is not currently possible to have multiple colorbars with plotly!\n") + if colorBarPosition != "side": + sys.error.write( + "Warning: It is not currently possible to have multiple colorbars with plotly!\n" + ) nRows = 
hm.matrix.get_num_groups() nCols = hm.matrix.get_num_samples() @@ -239,8 +294,8 @@ def plotlyMatrix(hm, if showSummaryPlot: profileHeight = 0.2 profileBottomBuffer = 0.05 - profileSideBuffer = 0. - profileWidth = 1. / nCols + profileSideBuffer = 0.0 + profileWidth = 1.0 / nCols if nCols > 1: profileSideBuffer = 0.1 / (nCols - 1) profileWidth = 0.9 / nCols @@ -248,7 +303,7 @@ def plotlyMatrix(hm, dataSummary = [] annos = [] fig = go.Figure() - fig['layout'].update(title=plotTitle) + fig["layout"].update(title=plotTitle) xAxisN = 1 yAxisN = 1 @@ -257,8 +312,8 @@ def plotlyMatrix(hm, yMinLocal = np.inf yMaxLocal = -np.inf for i in range(nCols): - xanchor = 'x{}'.format(xAxisN) - yanchor = 'y{}'.format(yAxisN) + xanchor = "x{}".format(xAxisN) + yanchor = "y{}".format(yAxisN) xBase = i * (profileSideBuffer + profileWidth) yBase = 1 - profileHeight xDomain = [xBase, xBase + profileWidth] @@ -267,20 +322,32 @@ def plotlyMatrix(hm, if perGroup: mat = hm.matrix.get_matrix(i, j) xTicks, xTicksLabels = hm.getTicks(i) - label = mat['sample'] + label = mat["sample"] else: mat = hm.matrix.get_matrix(j, i) xTicks, xTicksLabels = hm.getTicks(j) - label = mat['group'] + label = mat["group"] if j == 0: - fig['layout']['xaxis{}'.format(xAxisN)] = dict(domain=xDomain, anchor=yanchor, range=[0, mat['matrix'].shape[1]], tickmode='array', tickvals=xTicks, ticktext=xTicksLabels, tickangle=label_rotation) - fig['layout']['yaxis{}'.format(yAxisN)] = dict(anchor=xanchor, domain=yDomain) - trace = plotly_single(mat['matrix'], averageType, colorList[j], label)[0] + fig["layout"]["xaxis{}".format(xAxisN)] = dict( + domain=xDomain, + anchor=yanchor, + range=[0, mat["matrix"].shape[1]], + tickmode="array", + tickvals=xTicks, + ticktext=xTicksLabels, + tickangle=label_rotation, + ) + fig["layout"]["yaxis{}".format(yAxisN)] = dict( + anchor=xanchor, domain=yDomain + ) + trace = plotly_single(mat["matrix"], averageType, colorList[j], label)[ + 0 + ] trace.update(xaxis=xanchor, yaxis=yanchor, legendgroup=label) - if min(trace['y']) < yMinLocal: - yMinLocal = min(trace['y']) - if max(trace['y']) > yMaxLocal: - yMaxLocal = max(trace['y']) + if min(trace["y"]) < yMinLocal: + yMinLocal = min(trace["y"]) + if max(trace["y"]) > yMaxLocal: + yMaxLocal = max(trace["y"]) if i == 0: trace.update(showlegend=True) dataSummary.append(trace) @@ -291,7 +358,19 @@ def plotlyMatrix(hm, else: title = hm.matrix.sample_labels[i] titleX = xBase + 0.5 * profileWidth - annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': title, 'y': 1.0, 'x': titleX, 'font': {'size': 16}, 'showarrow': False}) + annos.append( + { + "yanchor": "bottom", + "xref": "paper", + "xanchor": "center", + "yref": "paper", + "text": title, + "y": 1.0, + "x": titleX, + "font": {"size": 16}, + "showarrow": False, + } + ) xAxisN += 1 yAxisN += 1 @@ -303,22 +382,22 @@ def plotlyMatrix(hm, yMaxUse = yMaxLocal if yMax[(i - 1) % len(yMax)] is not None: yMaxUse = yMax[(i - 1) % len(yMax)] - fig['layout']['yaxis{}'.format(i)].update(range=[yMinUse, yMaxUse]) - fig['layout']['yaxis1'].update(title=yAxisLabel) + fig["layout"]["yaxis{}".format(i)].update(range=[yMinUse, yMaxUse]) + fig["layout"]["yaxis1"].update(title=yAxisLabel) # Add the heatmap dataHeatmap = [] zMinLocal = np.inf zMaxLocal = -np.inf - heatmapWidth = 1. 
/ nCols + heatmapWidth = 1.0 / nCols heatmapSideBuffer = 0.0 if nCols > 1: - heatmapWidth = .9 / nCols + heatmapWidth = 0.9 / nCols heatmapSideBuffer = 0.1 / (nCols - 1) heatmapHeight = 1.0 - profileHeight - profileBottomBuffer for i in range(nCols): - xanchor = 'x{}'.format(xAxisN) + xanchor = "x{}".format(xAxisN) xBase = i * (heatmapSideBuffer + heatmapWidth) # Determine the height of each heatmap, they have no buffer @@ -328,50 +407,78 @@ def plotlyMatrix(hm, mat = hm.matrix.get_matrix(i, j) else: mat = hm.matrix.get_matrix(j, i) - lengths.append(mat['matrix'].shape[0]) - fractionalHeights = heatmapHeight * np.cumsum(lengths).astype(float) / np.sum(lengths).astype(float) + lengths.append(mat["matrix"].shape[0]) + fractionalHeights = ( + heatmapHeight + * np.cumsum(lengths).astype(float) + / np.sum(lengths).astype(float) + ) xDomain = [xBase, xBase + heatmapWidth] - fig['layout']['xaxis{}'.format(xAxisN)] = dict(domain=xDomain, anchor='free', position=0.0, range=[0, mat['matrix'].shape[1]], tickmode='array', tickvals=xTicks, ticktext=xTicksLabels, title=xAxisLabel) + fig["layout"]["xaxis{}".format(xAxisN)] = dict( + domain=xDomain, + anchor="free", + position=0.0, + range=[0, mat["matrix"].shape[1]], + tickmode="array", + tickvals=xTicks, + ticktext=xTicksLabels, + title=xAxisLabel, + ) # Start adding the heatmaps for j in range(nRows): if perGroup: mat = hm.matrix.get_matrix(i, j) - label = mat['sample'] + label = mat["sample"] start = hm.matrix.group_boundaries[i] end = hm.matrix.group_boundaries[i + 1] else: mat = hm.matrix.get_matrix(j, i) - label = mat['group'] + label = mat["group"] start = hm.matrix.group_boundaries[j] end = hm.matrix.group_boundaries[j + 1] regs = hm.matrix.regions[start:end] regs = [x[2] for x in regs] - yanchor = 'y{}'.format(yAxisN) - yDomain = [heatmapHeight - fractionalHeights[j + 1], heatmapHeight - fractionalHeights[j]] + yanchor = "y{}".format(yAxisN) + yDomain = [ + heatmapHeight - fractionalHeights[j + 1], + heatmapHeight - fractionalHeights[j], + ] visible = False if i == 0: visible = True - fig['layout']['yaxis{}'.format(yAxisN)] = dict(domain=yDomain, anchor=xanchor, visible=visible, title=label, tickmode='array', tickvals=[], ticktext=[]) - if np.min(mat['matrix']) < zMinLocal: - zMinLocal = np.min(mat['matrix']) - if np.max(mat['matrix']) < zMaxLocal: - zMaxLocal = np.max(mat['matrix']) - - trace = go.Heatmap(z=np.flipud(mat['matrix']), - y=regs[::-1], - xaxis=xanchor, - yaxis=yanchor, - showlegend=False, - name=label, - showscale=False) + fig["layout"]["yaxis{}".format(yAxisN)] = dict( + domain=yDomain, + anchor=xanchor, + visible=visible, + title=label, + tickmode="array", + tickvals=[], + ticktext=[], + ) + if np.min(mat["matrix"]) < zMinLocal: + zMinLocal = np.min(mat["matrix"]) + if np.max(mat["matrix"]) > zMaxLocal: + zMaxLocal = np.max(mat["matrix"]) + + trace = go.Heatmap( + z=np.flipud(mat["matrix"]), + y=regs[::-1], + xaxis=xanchor, + yaxis=yanchor, + showlegend=False, + name=label, + showscale=False, + ) dataHeatmap.append(trace) yAxisN += 1 xAxisN += 1 if showColorbar: dataHeatmap[-1].update(showscale=True) - dataHeatmap[-1]['colorbar'].update(len=heatmapHeight, y=0, yanchor='bottom', ypad=0.0) + dataHeatmap[-1]["colorbar"].update( + len=heatmapHeight, y=0, yanchor="bottom", ypad=0.0 + ) # Adjust z bounds and colorscale for trace in dataHeatmap: @@ -381,36 +488,48 @@ def plotlyMatrix(hm, zMinUse = zMin[0] if zMax[0] is not None: zMaxUse = zMax[0] - trace.update(zmin=zMinUse, zmax=zMaxUse, colorscale=convertCmap(cmap[0], vmin=zMinUse, 
vmax=zMaxUse)) + trace.update( + zmin=zMinUse, + zmax=zMaxUse, + colorscale=convertCmap(cmap[0], vmin=zMinUse, vmax=zMaxUse), + ) dataSummary.extend(dataHeatmap) fig.add_traces(dataSummary) - fig['layout']['annotations'] = annos + fig["layout"]["annotations"] = annos py.plot(fig, filename=outFilename, auto_open=False) -def plotMatrix(hm, outFileName, - colorMapDict={'colorMap': ['binary'], 'missingDataColor': 'black', 'alpha': 1.0}, - plotTitle='', - xAxisLabel='', yAxisLabel='', regionsLabel='', - zMin=None, zMax=None, - yMin=None, yMax=None, - averageType='median', - reference_point_label=None, - startLabel='TSS', endLabel="TES", - heatmapHeight=25, - heatmapWidth=7.5, - perGroup=False, whatToShow='plot, heatmap and colorbar', - plot_type='lines', - linesAtTickMarks=False, - image_format=None, - legend_location='upper-left', - box_around_heatmaps=True, - label_rotation=0.0, - dpi=200, - interpolation_method='auto'): - - hm.reference_point_label = hm.parameters['ref point'] +def plotMatrix( + hm, + outFileName, + colorMapDict={"colorMap": ["binary"], "missingDataColor": "black", "alpha": 1.0}, + plotTitle="", + xAxisLabel="", + yAxisLabel="", + regionsLabel="", + zMin=None, + zMax=None, + yMin=None, + yMax=None, + averageType="median", + reference_point_label=None, + startLabel="TSS", + endLabel="TES", + heatmapHeight=25, + heatmapWidth=7.5, + perGroup=False, + whatToShow="plot, heatmap and colorbar", + plot_type="lines", + linesAtTickMarks=False, + image_format=None, + legend_location="upper-left", + box_around_heatmaps=True, + label_rotation=0.0, + dpi=200, + interpolation_method="auto", +): + hm.reference_point_label = hm.parameters["ref point"] if reference_point_label is not None: hm.reference_point_label = [reference_point_label] * hm.matrix.get_num_samples() hm.startLabel = startLabel @@ -425,12 +544,12 @@ def plotMatrix(hm, outFileName, zMin = [None] else: zMin = [zMin] # convert to list to support multiple entries - elif 'auto' in zMin: + elif "auto" in zMin: matrix_flatten = hm.matrix.flatten() auto_min = np.percentile(matrix_flatten, 1.0) if np.isnan(auto_min): auto_min = None - new_mins = [float(x) if x != 'auto' else auto_min for x in zMin] + new_mins = [float(x) if x != "auto" else auto_min for x in zMin] zMin = new_mins else: new_mins = [float(x) for x in zMin] @@ -445,12 +564,12 @@ def plotMatrix(hm, outFileName, zMax = [None] else: zMax = [zMax] - elif 'auto' in zMax: + elif "auto" in zMax: matrix_flatten = hm.matrix.flatten() auto_max = np.percentile(matrix_flatten, 98.0) if np.isnan(auto_max): auto_max = None - new_maxs = [float(x) if x != 'auto' else auto_max for x in zMax] + new_maxs = [float(x) if x != "auto" else auto_max for x in zMax] zMax = new_maxs else: new_maxs = [float(x) for x in zMax] @@ -458,9 +577,11 @@ def plotMatrix(hm, outFileName, if (len(zMin) > 1) & (len(zMax) > 1): for index, value in enumerate(zMax): if value <= zMin[index]: - sys.stderr.write("Warnirng: In bigwig {}, the given zmin ({}) is larger than " - "or equal to the given zmax ({}). Thus, it has been set " - "to None. \n".format(index + 1, zMin[index], value)) + sys.stderr.write( + "Warning: In bigwig {}, the given zmin ({}) is larger than " + "or equal to the given zmax ({}). Thus, it has been set " + "to None. 
\n".format(index + 1, zMin[index], value) + ) zMin[index] = None if yMin is None: @@ -472,45 +593,60 @@ def plotMatrix(hm, outFileName, if not isinstance(yMax, list): yMax = [yMax] - plt.rcParams['font.size'] = 8.0 + plt.rcParams["font.size"] = 8.0 fontP = FontProperties() showSummaryPlot = False showColorbar = False - if whatToShow == 'plot and heatmap': + if whatToShow == "plot and heatmap": showSummaryPlot = True - elif whatToShow == 'heatmap and colorbar': + elif whatToShow == "heatmap and colorbar": showColorbar = True - elif whatToShow == 'plot, heatmap and colorbar': + elif whatToShow == "plot, heatmap and colorbar": showSummaryPlot = True showColorbar = True # colormap for the heatmap - if colorMapDict['colorMap']: + if colorMapDict["colorMap"]: cmap = [] - for color_map in colorMapDict['colorMap']: + for color_map in colorMapDict["colorMap"]: copy_cmp = copy.copy(plt.get_cmap(color_map)) cmap.append(copy_cmp) - cmap[-1].set_bad(colorMapDict['missingDataColor']) # nans are printed using this color + cmap[-1].set_bad( + colorMapDict["missingDataColor"] + ) # nans are printed using this color - if colorMapDict['colorList'] and len(colorMapDict['colorList']) > 0: + if colorMapDict["colorList"] and len(colorMapDict["colorList"]) > 0: # make a cmap for each color list given cmap = [] - for color_list in colorMapDict['colorList']: - cmap.append(matplotlib.colors.LinearSegmentedColormap.from_list( - 'my_cmap', color_list.replace(' ', '').split(","), N=colorMapDict['colorNumber'])) - cmap[-1].set_bad(colorMapDict['missingDataColor']) # nans are printed using this color + for color_list in colorMapDict["colorList"]: + cmap.append( + matplotlib.colors.LinearSegmentedColormap.from_list( + "my_cmap", + color_list.replace(" ", "").split(","), + N=colorMapDict["colorNumber"], + ) + ) + cmap[-1].set_bad( + colorMapDict["missingDataColor"] + ) # nans are printed using this color if len(cmap) > 1 or len(zMin) > 1 or len(zMax) > 1: # position color bar below heatmap when more than one # heatmap color is given - colorbar_position = 'below' + colorbar_position = "below" else: - colorbar_position = 'side' + colorbar_position = "side" - grids = prepare_layout(hm.matrix, (heatmapWidth, heatmapHeight), - showSummaryPlot, showColorbar, perGroup, colorbar_position) + grids = prepare_layout( + hm.matrix, + (heatmapWidth, heatmapHeight), + showSummaryPlot, + showColorbar, + perGroup, + colorbar_position, + ) # figsize: w,h tuple in inches figwidth = heatmapWidth / 2.54 @@ -527,7 +663,7 @@ def plotMatrix(hm, outFileName, num_cols = numsamples total_figwidth = figwidth * num_cols if showColorbar: - if colorbar_position == 'below': + if colorbar_position == "below": figheight += 1 / 2.54 else: total_figwidth += 1 / 2.54 @@ -536,32 +672,43 @@ def plotMatrix(hm, outFileName, fig.suptitle(plotTitle, y=1 - (0.06 / figheight)) # color map for the summary plot (profile) on top of the heatmap - cmap_plot = plt.get_cmap('jet') + cmap_plot = plt.get_cmap("jet") numgroups = hm.matrix.get_num_groups() if perGroup: - color_list = cmap_plot(np.arange(hm.matrix.get_num_samples()) / hm.matrix.get_num_samples()) + color_list = cmap_plot( + np.arange(hm.matrix.get_num_samples()) / hm.matrix.get_num_samples() + ) else: color_list = cmap_plot(np.arange(numgroups) / numgroups) - alpha = colorMapDict['alpha'] - if image_format == 'plotly': - return plotlyMatrix(hm, - outFileName, - yMin=yMin, yMax=yMax, - zMin=zMin, zMax=zMax, - showSummaryPlot=showSummaryPlot, showColorbar=showColorbar, - cmap=cmap, colorList=color_list, 
colorBarPosition=colorbar_position, - perGroup=perGroup, - averageType=averageType, plotTitle=plotTitle, - xAxisLabel=xAxisLabel, yAxisLabel=yAxisLabel, - label_rotation=label_rotation) + alpha = colorMapDict["alpha"] + if image_format == "plotly": + return plotlyMatrix( + hm, + outFileName, + yMin=yMin, + yMax=yMax, + zMin=zMin, + zMax=zMax, + showSummaryPlot=showSummaryPlot, + showColorbar=showColorbar, + cmap=cmap, + colorList=color_list, + colorBarPosition=colorbar_position, + perGroup=perGroup, + averageType=averageType, + plotTitle=plotTitle, + xAxisLabel=xAxisLabel, + yAxisLabel=yAxisLabel, + label_rotation=label_rotation, + ) # check if matrix is reference-point based using the upstream >0 value # and is sorted by region length. If this is # the case, prepare the data to plot a border at the regions end - regions_length_in_bins = [None] * len(hm.parameters['upstream']) - if hm.matrix.sort_using == 'region_length' and hm.matrix.sort_method != 'no': - for idx in range(len(hm.parameters['upstream'])): - if hm.parameters['ref point'][idx] is None: + regions_length_in_bins = [None] * len(hm.parameters["upstream"]) + if hm.matrix.sort_using == "region_length" and hm.matrix.sort_method != "no": + for idx in range(len(hm.parameters["upstream"])): + if hm.parameters["ref point"][idx] is None: regions_length_in_bins[idx] = None continue @@ -571,16 +718,25 @@ def plotMatrix(hm, outFileName, _reg_len = [] for ind_reg in _group: if isinstance(ind_reg, dict): - _len = ind_reg['end'] - ind_reg['start'] + _len = ind_reg["end"] - ind_reg["start"] else: _len = sum([x[1] - x[0] for x in ind_reg[1]]) - if hm.parameters['ref point'][idx] == 'TSS': - _reg_len.append((hm.parameters['upstream'][idx] + _len) / hm.parameters['bin size'][idx]) - elif hm.parameters['ref point'][idx] == 'center': + if hm.parameters["ref point"][idx] == "TSS": + _reg_len.append( + (hm.parameters["upstream"][idx] + _len) + / hm.parameters["bin size"][idx] + ) + elif hm.parameters["ref point"][idx] == "center": _len *= 0.5 - _reg_len.append((hm.parameters['upstream'][idx] + _len) / hm.parameters['bin size'][idx]) - elif hm.parameters['ref point'][idx] == 'TES': - _reg_len.append((hm.parameters['upstream'][idx] - _len) / hm.parameters['bin size'][idx]) + _reg_len.append( + (hm.parameters["upstream"][idx] + _len) + / hm.parameters["bin size"][idx] + ) + elif hm.parameters["ref point"][idx] == "TES": + _reg_len.append( + (hm.parameters["upstream"][idx] - _len) + / hm.parameters["bin size"][idx] + ) foo.append(_reg_len) regions_length_in_bins[idx] = foo @@ -592,24 +748,73 @@ def plotMatrix(hm, outFileName, else: iterNum = hm.matrix.get_num_samples() iterNum2 = numgroups - ax_list = addProfilePlot(hm, plt, fig, grids, iterNum, iterNum2, perGroup, averageType, plot_type, yAxisLabel, color_list, yMin, yMax, None, None, colorbar_position, label_rotation) + ax_list = addProfilePlot( + hm, + plt, + fig, + grids, + iterNum, + iterNum2, + perGroup, + averageType, + plot_type, + yAxisLabel, + color_list, + yMin, + yMax, + None, + None, + colorbar_position, + label_rotation, + ) if len(yMin) > 1 or len(yMax) > 1: # replot with a tight layout import matplotlib.tight_layout as tl + specList = tl.get_subplotspec_list(fig.axes, grid_spec=grids) renderer = tl.get_renderer(fig) - kwargs = tl.get_tight_layout_figure(fig, fig.axes, specList, renderer, pad=1.08) + kwargs = tl.get_tight_layout_figure( + fig, fig.axes, specList, renderer, pad=1.08 + ) for ax in ax_list: fig.delaxes(ax) - ax_list = addProfilePlot(hm, plt, fig, grids, iterNum, iterNum2, 
perGroup, averageType, plot_type, yAxisLabel, color_list, yMin, yMax, kwargs['wspace'], kwargs['hspace'], colorbar_position, label_rotation) - - if legend_location != 'none': - ax = ax_list[-1] # beisi + ax_list = addProfilePlot( + hm, + plt, + fig, + grids, + iterNum, + iterNum2, + perGroup, + averageType, + plot_type, + yAxisLabel, + color_list, + yMin, + yMax, + kwargs["wspace"], + kwargs["hspace"], + colorbar_position, + label_rotation, + ) + + if legend_location != "none": + ax = ax_list[-1] # beisi box = ax.get_position() - ax.set_position([box.x0, box.y0 - box.height * 0.1, box.width, box.height * 0.9]) - legend = ax.legend(loc='lower right', shadow=False, fontsize='x-large', bbox_to_anchor=(0, 1.3, 1, .22), ncol=10, frameon=False, prop=fontP) # beisi, legend line + ax.set_position( + [box.x0, box.y0 - box.height * 0.1, box.width, box.height * 0.9] + ) + legend = ax.legend( + loc="lower right", + shadow=False, + fontsize="x-large", + bbox_to_anchor=(0, 1.3, 1, 0.22), + ncol=10, + frameon=False, + prop=fontP, + ) # beisi, legend line ax.add_artist(legend) # ax_list[-1].legend(loc=legend_location.replace('-', ' '), ncol=1, prop=fontP, # frameon=False, markerscale=0.5) @@ -652,33 +857,37 @@ def plotMatrix(hm, outFileName, if box_around_heatmaps is False: # Turn off the boxes around the individual heatmaps - ax.spines['top'].set_visible(False) - ax.spines['right'].set_visible(False) - ax.spines['bottom'].set_visible(False) - ax.spines['left'].set_visible(False) - rows, cols = sub_matrix['matrix'].shape + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + ax.spines["bottom"].set_visible(False) + ax.spines["left"].set_visible(False) + rows, cols = sub_matrix["matrix"].shape # if the number of rows is too large, then the 'nearest' method simply # drops rows. A better solution is to relate the threshold to the DPI of the image - if interpolation_method == 'auto': + if interpolation_method == "auto": if rows >= 1000: - interpolation_method = 'bilinear' + interpolation_method = "bilinear" else: - interpolation_method = 'nearest' + interpolation_method = "nearest" # if np.clip is not used, then values of the matrix that exceed the zmax limit are # highlighted. Usually, a significant amount of pixels are equal or above the zmax and # the default behaviour produces images full of large highlighted dots. 
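The clip-before-draw logic described in the surrounding comments is easy to reproduce in isolation. A minimal sketch with toy data (all names below are illustrative, not deepTools code):

    import numpy as np
    import matplotlib
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt

    mat = np.random.gamma(shape=1.0, scale=2.0, size=(2000, 300))
    zmin, zmax = np.percentile(mat, 1.0), np.percentile(mat, 98.0)

    # Mirror the 'auto' rule above: 'nearest' drops rows on tall matrices,
    # so switch to 'bilinear' once there are many rows.
    interpolation = "bilinear" if mat.shape[0] >= 1000 else "nearest"

    # Without this clip, bilinear interpolation lets the few bins above zmax
    # bleed through as bright dots; clipping pins them at zmax instead.
    clipped = np.clip(mat, zmin, zmax)

    fig, ax = plt.subplots()
    ax.imshow(clipped, aspect="auto", origin="upper",
              interpolation=interpolation, vmin=zmin, vmax=zmax)
    fig.savefig("toy_heatmap.png", dpi=200)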
# If interpolation='nearest' is used, this has no effect - sub_matrix['matrix'] = np.clip(sub_matrix['matrix'], zMin[zmin_idx], zMax[zmax_idx]) - img = ax.imshow(sub_matrix['matrix'], - aspect='auto', - interpolation=interpolation_method, - origin='upper', - vmin=zMin[zmin_idx], - vmax=zMax[zmax_idx], - cmap=cmap[cmap_idx], - alpha=alpha, - extent=[0, cols, rows, 0]) + sub_matrix["matrix"] = np.clip( + sub_matrix["matrix"], zMin[zmin_idx], zMax[zmax_idx] + ) + img = ax.imshow( + sub_matrix["matrix"], + aspect="auto", + interpolation=interpolation_method, + origin="upper", + vmin=zMin[zmin_idx], + vmax=zMax[zmax_idx], + cmap=cmap[cmap_idx], + alpha=alpha, + extent=[0, cols, rows, 0], + ) img.set_rasterized(True) # plot border at the end of the regions # if ordered by length @@ -686,14 +895,19 @@ def plotMatrix(hm, outFileName, x_lim = ax.get_xlim() y_lim = ax.get_ylim() - ax.plot(regions_length_in_bins[sample][group_idx], - np.arange(len(regions_length_in_bins[sample][group_idx])), - '--', color='black', linewidth=0.5, dashes=(3, 2)) + ax.plot( + regions_length_in_bins[sample][group_idx], + np.arange(len(regions_length_in_bins[sample][group_idx])), + "--", + color="black", + linewidth=0.5, + dashes=(3, 2), + ) ax.set_xlim(x_lim) ax.set_ylim(y_lim) if perGroup: - ax.axes.set_xlabel(sub_matrix['group']) + ax.axes.set_xlabel(sub_matrix["group"]) if sample < hm.matrix.get_num_samples() - 1: ax.axes.get_xaxis().set_visible(False) else: @@ -701,32 +915,44 @@ def plotMatrix(hm, outFileName, ax.axes.set_xlabel(xAxisLabel) ax.axes.set_yticks([]) if perGroup and group == 0: - ax.axes.set_ylabel(sub_matrix['sample'],rotation=75,labelpad=0,fontsize=15) + ax.axes.set_ylabel( + sub_matrix["sample"], rotation=75, labelpad=0, fontsize=15 + ) elif not perGroup and sample == 0: - ax.axes.set_ylabel(sub_matrix['group'],rotation=75,labelpad=0,horizontalalignment='right',fontsize=15) + ax.axes.set_ylabel( + sub_matrix["group"], + rotation=75, + labelpad=0, + horizontalalignment="right", + fontsize=15, + ) # Plot vertical lines at tick marks if desired if linesAtTickMarks: xticks_heat, xtickslabel_heat = hm.getTicks(sample) - xticks_heat = [x + 0.5 for x in xticks_heat] # There's an offset of 0.5 compared to the profile plot - if np.ceil(max(xticks_heat)) != float(sub_matrix['matrix'].shape[1]): - tickscale = float(sub_matrix['matrix'].shape[1]) / max(xticks_heat) + xticks_heat = [ + x + 0.5 for x in xticks_heat + ] # There's an offset of 0.5 compared to the profile plot + if np.ceil(max(xticks_heat)) != float(sub_matrix["matrix"].shape[1]): + tickscale = float(sub_matrix["matrix"].shape[1]) / max(xticks_heat) xticks_heat_use = [x * tickscale for x in xticks_heat] else: xticks_heat_use = xticks_heat for x in xticks_heat_use: - ax.axvline(x=x, color='black', linewidth=0.5, dashes=(3, 2)) + ax.axvline(x=x, color="black", linewidth=0.5, dashes=(3, 2)) # add labels to last block in a column - if (perGroup and sample == numsamples - 1) or \ - (not perGroup and group_idx == numgroups - 1): - + if (perGroup and sample == numsamples - 1) or ( + not perGroup and group_idx == numgroups - 1 + ): # add xticks to the bottom heatmap (last group) ax.axes.get_xaxis().set_visible(True) xticks_heat, xtickslabel_heat = hm.getTicks(sample) - xticks_heat = [x + 0.5 for x in xticks_heat] # There's an offset of 0.5 compared to the profile plot - if np.ceil(max(xticks_heat)) != float(sub_matrix['matrix'].shape[1]): - tickscale = float(sub_matrix['matrix'].shape[1]) / max(xticks_heat) + xticks_heat = [ + x + 0.5 for x in xticks_heat + ] 
# There's an offset of 0.5 compared to the profile plot + if np.ceil(max(xticks_heat)) != float(sub_matrix["matrix"].shape[1]): + tickscale = float(sub_matrix["matrix"].shape[1]) / max(xticks_heat) xticks_heat_use = [x * tickscale for x in xticks_heat] ax.axes.set_xticks(xticks_heat_use) else: @@ -737,15 +963,12 @@ def plotMatrix(hm, outFileName, # such that they don't fall off # the heatmap sides ticks = ax.xaxis.get_major_ticks() - ticks[0].label1.set_horizontalalignment('left') - ticks[-1].label1.set_horizontalalignment('right') + ticks[0].label1.set_horizontalalignment("left") + ticks[-1].label1.set_horizontalalignment("right") - ax.get_xaxis().set_tick_params( - which='both', - top=False, - direction='out') + ax.get_xaxis().set_tick_params(which="both", top=False, direction="out") - if showColorbar and colorbar_position == 'below': + if showColorbar and colorbar_position == "below": # draw a colormap per each heatmap below the last block if perGroup: col = group_idx @@ -753,22 +976,24 @@ def plotMatrix(hm, outFileName, col = sample ax = fig.add_subplot(grids[-1, col]) tick_locator = ticker.MaxNLocator(nbins=3) - cbar = fig.colorbar(img, cax=ax, orientation='horizontal', ticks=tick_locator) + cbar = fig.colorbar( + img, cax=ax, orientation="horizontal", ticks=tick_locator + ) labels = cbar.ax.get_xticklabels() ticks = cbar.ax.get_xticks() if ticks[0] == 0: # if the label is at the start of the colorbar # move it a bit inside to avoid overlapping # with other labels - labels[0].set_horizontalalignment('left') + labels[0].set_horizontalalignment("left") if ticks[-1] == 1: # if the label is at the end of the colorbar # move it a bit inside to avoid overlapping # with other labels - labels[-1].set_horizontalalignment('right') + labels[-1].set_horizontalalignment("right") # cbar.ax.set_xticklabels(labels, rotation=90) - if showColorbar and colorbar_position != 'below': + if showColorbar and colorbar_position != "below": if showSummaryPlot: # we don't want the colorbar to extend # over the profiles and spacer top rows @@ -780,12 +1005,18 @@ def plotMatrix(hm, outFileName, fig.colorbar(img, cax=ax) if box_around_heatmaps: - plt.subplots_adjust(wspace=0.10, hspace=0.025, top=0.85, bottom=0, left=0.04, right=0.96) + plt.subplots_adjust( + wspace=0.10, hspace=0.025, top=0.85, bottom=0, left=0.04, right=0.96 + ) else: # When no box is plotted the space between heatmaps is reduced - plt.subplots_adjust(wspace=0.05, hspace=0.01, top=0.85, bottom=0, left=0.04, right=0.96) + plt.subplots_adjust( + wspace=0.05, hspace=0.01, top=0.85, bottom=0, left=0.04, right=0.96 + ) - plt.savefig(outFileName, bbox_inches='tight', pad_inches=0.1, dpi=dpi, format=image_format) + plt.savefig( + outFileName, bbox_inches="tight", pad_inches=0.1, dpi=dpi, format=image_format + ) plt.close() @@ -804,8 +1035,7 @@ def mergeSmallGroups(matrixDict): if len(to_merge): to_merge.append(label) new_label = " ".join(to_merge) - new_ma = np.concatenate([matrixDict[item] - for item in to_merge], axis=0) + new_ma = np.concatenate([matrixDict[item] for item in to_merge], axis=0) else: new_label = label new_ma = matrixDict[label] @@ -832,24 +1062,39 @@ def main(args=None): args.matrixFile.close() hm.read_matrix_file(matrix_file) - if hm.parameters['min threshold'] is not None or hm.parameters['max threshold'] is not None: - filterHeatmapValues(hm, hm.parameters['min threshold'], hm.parameters['max threshold']) + if ( + hm.parameters["min threshold"] is not None + or hm.parameters["max threshold"] is not None + ): + filterHeatmapValues( 
hm, hm.parameters["min threshold"], hm.parameters["max threshold"] + ) - if args.sortRegions == 'keep': - args.sortRegions = 'no' # These are the same thing + if args.sortRegions == "keep": + args.sortRegions = "no" # These are the same thing if args.kmeans is not None: - hm.matrix.hmcluster(args.kmeans, method='kmeans', clustering_samples=args.clusterUsingSamples) + hm.matrix.hmcluster( + args.kmeans, method="kmeans", clustering_samples=args.clusterUsingSamples + ) elif args.hclust is not None: - print("Performing hierarchical clustering." - "Please note that it might be very slow for large datasets.\n") - hm.matrix.hmcluster(args.hclust, method='hierarchical', clustering_samples=args.clusterUsingSamples) + print( + "Performing hierarchical clustering. " + "Please note that it might be very slow for large datasets.\n" + ) + hm.matrix.hmcluster( + args.hclust, + method="hierarchical", + clustering_samples=args.clusterUsingSamples, + ) group_len_ratio = np.diff(hm.matrix.group_boundaries) / len(hm.matrix.regions) if np.any(group_len_ratio < 5.0 / 1000): problem = np.flatnonzero(group_len_ratio < 5.0 / 1000) - sys.stderr.write("WARNING: Group '{}' is too small for plotting, you might want to remove it. " - "There will likely be an error message from matplotlib regarding this " - "below.\n".format(hm.matrix.group_labels[problem[0]])) + sys.stderr.write( + "WARNING: Group '{}' is too small for plotting, you might want to remove it. " + "There will likely be an error message from matplotlib regarding this " + "below.\n".format(hm.matrix.group_labels[problem[0]]) + ) if args.regionsLabel: hm.matrix.set_group_labels(args.regionsLabel) @@ -857,19 +1102,25 @@ def main(args=None): if args.samplesLabel and len(args.samplesLabel): hm.matrix.set_sample_labels(args.samplesLabel) - if args.sortRegions != 'no': + if args.sortRegions != "no": sortUsingSamples = [] if args.sortUsingSamples is not None: for i in args.sortUsingSamples: - if (i > 0 and i <= hm.matrix.get_num_samples()): + if i > 0 and i <= hm.matrix.get_num_samples(): sortUsingSamples.append(i - 1) else: - exit("The value {0} for --sortSamples is not valid. Only values from 1 to {1} are allowed.".format(args.sortUsingSamples, hm.matrix.get_num_samples())) - print('Samples used for ordering within each group: ', sortUsingSamples) - - hm.matrix.sort_groups(sort_using=args.sortUsing, - sort_method=args.sortRegions, - sample_list=sortUsingSamples) + exit( + "The value {0} for --sortSamples is not valid. 
Only values from 1 to {1} are allowed.".format( + args.sortUsingSamples, hm.matrix.get_num_samples() + ) + ) + print("Samples used for ordering within each group: ", sortUsingSamples) + + hm.matrix.sort_groups( + sort_using=args.sortUsing, + sort_method=args.sortRegions, + sample_list=sortUsingSamples, + ) if args.silhouette: if args.kmeans is not None: @@ -883,31 +1134,40 @@ def main(args=None): if args.outFileSortedRegions: hm.save_BED(args.outFileSortedRegions) - colormap_dict = {'colorMap': args.colorMap, - 'colorList': args.colorList, - 'colorNumber': args.colorNumber, - 'missingDataColor': args.missingDataColor, - 'alpha': args.alpha} - - plotMatrix(hm, - args.outFileName, - colormap_dict, args.plotTitle, - args.xAxisLabel, args.yAxisLabel, args.regionsLabel, - args.zMin, args.zMax, - args.yMin, args.yMax, - args.averageTypeSummaryPlot, - args.refPointLabel, - args.startLabel, - args.endLabel, - args.heatmapHeight, - args.heatmapWidth, - args.perGroup, - args.whatToShow, - linesAtTickMarks=args.linesAtTickMarks, - plot_type=args.plotType, - image_format=args.plotFileFormat, - legend_location=args.legendLocation, - box_around_heatmaps=args.boxAroundHeatmaps, - label_rotation=args.label_rotation, - dpi=args.dpi, - interpolation_method=args.interpolationMethod) + colormap_dict = { + "colorMap": args.colorMap, + "colorList": args.colorList, + "colorNumber": args.colorNumber, + "missingDataColor": args.missingDataColor, + "alpha": args.alpha, + } + + plotMatrix( + hm, + args.outFileName, + colormap_dict, + args.plotTitle, + args.xAxisLabel, + args.yAxisLabel, + args.regionsLabel, + args.zMin, + args.zMax, + args.yMin, + args.yMax, + args.averageTypeSummaryPlot, + args.refPointLabel, + args.startLabel, + args.endLabel, + args.heatmapHeight, + args.heatmapWidth, + args.perGroup, + args.whatToShow, + linesAtTickMarks=args.linesAtTickMarks, + plot_type=args.plotType, + image_format=args.plotFileFormat, + legend_location=args.legendLocation, + box_around_heatmaps=args.boxAroundHeatmaps, + label_rotation=args.label_rotation, + dpi=args.dpi, + interpolation_method=args.interpolationMethod, + ) diff --git a/deeptools/plotPCA.py b/deeptools/plotPCA.py index d12eac8d4..228b2c06f 100644 --- a/deeptools/plotPCA.py +++ b/deeptools/plotPCA.py @@ -4,9 +4,10 @@ import sys import argparse import matplotlib -matplotlib.use('Agg') -matplotlib.rcParams['pdf.fonttype'] = 42 -matplotlib.rcParams['svg.fonttype'] = 'none' + +matplotlib.use("Agg") +matplotlib.rcParams["pdf.fonttype"] = 42 +matplotlib.rcParams["svg.fonttype"] = "none" from deeptools import cm # noqa: F401 from deeptools.correlation import Correlation @@ -27,129 +28,165 @@ def parse_arguments(args=None): plotPCA -h """, - epilog='example usages:\n' - 'plotPCA -in coverages.npz -o pca.png\n\n' - ' \n\n', - parents=[basic_args, ]) + epilog="example usages:\n" "plotPCA -in coverages.npz -o pca.png\n\n" " \n\n", + parents=[ + basic_args, + ], + ) return parser def plotCorrelationArgs(): parser = argparse.ArgumentParser(add_help=False) - required = parser.add_argument_group('Required arguments') + required = parser.add_argument_group("Required arguments") # define the arguments - required.add_argument('--corData', '-in', - metavar='FILE', - help='Coverage file (generated by multiBamSummary or multiBigwigSummary)', - required=True) - - optional = parser.add_argument_group('Optional arguments') - optional.add_argument('--plotFile', '-o', - help='File name to save the plot to. ' - 'The extension determines the file format. 
' - 'For example: ' - 'pca.pdf will save the PCA plot in PDF format. ' - 'The available options are: .png, ' - '.eps, .pdf and .svg. If this option is omitted, then you MUST specify --outFileNameData', - type=writableFile, - metavar='FILE') - - optional.add_argument('--labels', '-l', - metavar='sample1 sample2', - help='User defined labels instead of default labels from ' - 'file names. ' - 'Multiple labels have to be separated by spaces, e.g. ' - '--labels sample1 sample2 sample3', - nargs='+') - - optional.add_argument('--plotTitle', '-T', - help='Title of the plot, to be printed on top of ' - 'the generated image. Leave blank for no title. (Default: %(default)s)', - default='') - - optional.add_argument('--plotFileFormat', - metavar='FILETYPE', - help='Image format type. If given, this option ' - 'overrides the image format based on the plotFile ' - 'ending. The available options are: png, ' - 'eps, pdf, plotly and svg.', - choices=['png', 'pdf', 'svg', 'eps', 'plotly']) - - optional.add_argument('--plotHeight', - help='Plot height in cm. (Default: %(default)s)', - type=float, - default=10) - - optional.add_argument('--plotWidth', - help='Plot width in cm. The minimum value is 1 cm. (Default: %(default)s)', - type=float, - default=10) - - optional.add_argument('--outFileNameData', - metavar='file.tab', - type=writableFile, - help='File name to which the data underlying the plot ' - 'should be saved, such as myPCA.tab. For untransposed ' - 'data, this is the loading per-sample and PC as well ' - 'as the eigenvalues. For transposed data, this is the ' - 'rotation per-sample and PC and the eigenvalues. The ' - 'projections are truncated to the number of ' - 'eigenvalues for transposed data.') - - optional.add_argument('--ntop', - help='Use only the top N most variable rows in the ' - 'original matrix. Specifying 0 will result in all ' - 'rows being used. If the matrix is to be transposed, ' - 'rows with 0 variance are always excluded, even if a ' - 'values of 0 is specified. The default is 1000. (Default: %(default)s)', - type=int, - default=1000) - - optional.add_argument('--PCs', - help='The principal components to plot. If specified, ' - 'you must provide two different integers, greater ' - 'than zero, separated by a space. An example (and the default) is "1 2". (Default: %(default)s)', - type=int, - nargs=2, - default=[1, 2]) - - optional.add_argument('--log2', - help='log2 transform the datapoints prior to computing ' - 'the PCA. Note that 0.01 is added to all values to ' - 'prevent 0 values from becoming -infinity. Using this ' - 'option with input that contains negative values will ' - 'result in an error.', - action='store_true') - - optional.add_argument('--colors', - metavar="COLORS", - nargs='+', - help="A list of colors for the symbols. Color names and html hex string (e.g., #eeff22) are accepted. The color names should be space separated. For example, --colors red blue green. If not specified, the symbols will be given automatic colors.") - - optional.add_argument('--markers', - metavar="MARKERS", - nargs='+', - help="A list of markers for the symbols. (e.g., '<','>','o') are accepted. The marker values should be space separated. For example, --markers 's' 'o' 's' 'o'. 
If not specified, the symbols will be given automatic shapes.") - - optional.add_argument('--version', action='version', - version='%(prog)s {}'.format(__version__)) + required.add_argument( + "--corData", + "-in", + metavar="FILE", + help="Coverage file (generated by multiBamSummary or multiBigwigSummary)", + required=True, + ) + + optional = parser.add_argument_group("Optional arguments") + optional.add_argument( + "--plotFile", + "-o", + help="File name to save the plot to. " + "The extension determines the file format. " + "For example: " + "pca.pdf will save the PCA plot in PDF format. " + "The available options are: .png, " + ".eps, .pdf and .svg. If this option is omitted, then you MUST specify --outFileNameData", + type=writableFile, + metavar="FILE", + ) + + optional.add_argument( + "--labels", + "-l", + metavar="sample1 sample2", + help="User defined labels instead of default labels from " + "file names. " + "Multiple labels have to be separated by spaces, e.g. " + "--labels sample1 sample2 sample3", + nargs="+", + ) + + optional.add_argument( + "--plotTitle", + "-T", + help="Title of the plot, to be printed on top of " + "the generated image. Leave blank for no title. (Default: %(default)s)", + default="", + ) + + optional.add_argument( + "--plotFileFormat", + metavar="FILETYPE", + help="Image format type. If given, this option " + "overrides the image format based on the plotFile " + "ending. The available options are: png, " + "eps, pdf, plotly and svg.", + choices=["png", "pdf", "svg", "eps", "plotly"], + ) + + optional.add_argument( + "--plotHeight", + help="Plot height in cm. (Default: %(default)s)", + type=float, + default=10, + ) + + optional.add_argument( + "--plotWidth", + help="Plot width in cm. The minimum value is 1 cm. (Default: %(default)s)", + type=float, + default=10, + ) + + optional.add_argument( + "--outFileNameData", + metavar="file.tab", + type=writableFile, + help="File name to which the data underlying the plot " + "should be saved, such as myPCA.tab. For untransposed " + "data, this is the loading per-sample and PC as well " + "as the eigenvalues. For transposed data, this is the " + "rotation per-sample and PC and the eigenvalues. The " + "projections are truncated to the number of " + "eigenvalues for transposed data.", + ) + + optional.add_argument( + "--ntop", + help="Use only the top N most variable rows in the " + "original matrix. Specifying 0 will result in all " + "rows being used. If the matrix is to be transposed, " + "rows with 0 variance are always excluded, even if a " + "value of 0 is specified. The default is 1000. (Default: %(default)s)", + type=int, + default=1000, + ) + + optional.add_argument( + "--PCs", + help="The principal components to plot. If specified, " + "you must provide two different integers, greater " + 'than zero, separated by a space. An example (and the default) is "1 2". (Default: %(default)s)', + type=int, + nargs=2, + default=[1, 2], + ) + + optional.add_argument( + "--log2", + help="log2 transform the datapoints prior to computing " + "the PCA. Note that 0.01 is added to all values to " + "prevent 0 values from becoming -infinity. Using this " + "option with input that contains negative values will " + "result in an error.", + action="store_true", + ) + + optional.add_argument( + "--colors", + metavar="COLORS", + nargs="+", + help="A list of colors for the symbols. Color names and html hex strings (e.g., #eeff22) are accepted. The color names should be space separated. For example, --colors red blue green. 
If not specified, the symbols will be given automatic colors.", + ) + + optional.add_argument( + "--markers", + metavar="MARKERS", + nargs="+", + help="A list of markers for the symbols. (e.g., '<','>','o') are accepted. The marker values should be space separated. For example, --markers 's' 'o' 's' 'o'. If not specified, the symbols will be given automatic shapes.", + ) + + optional.add_argument( + "--version", action="version", version="%(prog)s {}".format(__version__) + ) optionalEx = optional.add_mutually_exclusive_group() - optionalEx.add_argument('--transpose', - help='Perform the PCA on the transposed matrix, (i.e., on the ' - 'matrix where rows are samples and columns are ' - 'bins/features. This then matches what is typically ' - 'done in R.', - action='store_true') - - optionalEx.add_argument('--rowCenter', - help='When specified, each row (bin, gene, etc.) ' - 'in the matrix is centered at 0 before the PCA is ' - 'computed. This is useful only if you have a strong ' - 'bin/gene/etc. correlation and the resulting ' - 'principal component has samples stacked vertically. This option is not applicable if --transpose is specified.', - action='store_true') + optionalEx.add_argument( + "--transpose", + help="Perform the PCA on the transposed matrix, (i.e., on the " + "matrix where rows are samples and columns are " + "bins/features. This then matches what is typically " + "done in R.", + action="store_true", + ) + + optionalEx.add_argument( + "--rowCenter", + help="When specified, each row (bin, gene, etc.) " + "in the matrix is centered at 0 before the PCA is " + "computed. This is useful only if you have a strong " + "bin/gene/etc. correlation and the resulting " + "principal component has samples stacked vertically. This option is not applicable if --transpose is specified.", + action="store_true", + ) return parser @@ -158,7 +195,9 @@ def main(args=None): args = parse_arguments().parse_args(args) if args.plotFile is None and args.outFileNameData is None: - sys.exit("At least one of --plotFile and --outFileNameData must be specified!\n") + sys.exit( + "At least one of --plotFile and --outFileNameData must be specified!\n" + ) if args.ntop < 0: sys.exit("The value specified for --ntop must be >= 0!\n") @@ -168,22 +207,26 @@ def main(args=None): if args.PCs[0] <= 0 or args.PCs[1] <= 0: sys.exit("The specified principal components must be at least 1!\n") - corr = Correlation(args.corData, - labels=args.labels,) + corr = Correlation( + args.corData, + labels=args.labels, + ) corr.rowCenter = args.rowCenter corr.transpose = args.transpose corr.ntop = args.ntop corr.log2 = args.log2 - Wt, eigenvalues = corr.plot_pca(args.plotFile, - PCs=args.PCs, - plot_title=args.plotTitle, - image_format=args.plotFileFormat, - plotWidth=args.plotWidth, - plotHeight=args.plotHeight, - cols=args.colors, - marks=args.markers) + Wt, eigenvalues = corr.plot_pca( + args.plotFile, + PCs=args.PCs, + plot_title=args.plotTitle, + image_format=args.plotFileFormat, + plotWidth=args.plotWidth, + plotHeight=args.plotHeight, + cols=args.colors, + marks=args.markers, + ) if args.outFileNameData is not None: of = open(args.outFileNameData, "w") @@ -191,7 +234,11 @@ def main(args=None): of.write("Component\t{}\tEigenvalue\n".format("\t".join(corr.labels))) n = eigenvalues.shape[0] for i in range(n): - of.write("{}\t{}\t{}\n".format(i + 1, "\t".join(["{}".format(x) for x in Wt[i, :]]), eigenvalues[i])) + of.write( + "{}\t{}\t{}\n".format( + i + 1, "\t".join(["{}".format(x) for x in Wt[i, :]]), eigenvalues[i] + ) + ) 
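The loop above emits a plain tab-separated table: one row per principal component, the per-sample loadings in between, and the eigenvalue last. Reading it back is straightforward; a sketch (the file name and the pandas dependency are assumptions, not part of plotPCA):

    import pandas as pd

    # comment="#" skips a leading '#...' header line, if one is present.
    df = pd.read_csv("myPCA.tab", sep="\t", comment="#")
    eig = df["Eigenvalue"].to_numpy()
    # Treat each eigenvalue as its share of the total variance.
    for comp, frac in zip(df["Component"], eig / eig.sum()):
        print("PC{}: {:.1%}".format(comp, frac))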
of.close() diff --git a/deeptools/plotProfile.py b/deeptools/plotProfile.py index ce49e82ab..040e172b3 100755 --- a/deeptools/plotProfile.py +++ b/deeptools/plotProfile.py @@ -8,9 +8,11 @@ import numpy as np from math import ceil import matplotlib -matplotlib.use('Agg') -matplotlib.rcParams['pdf.fonttype'] = 42 -matplotlib.rcParams['svg.fonttype'] = 'none' +import copy, re + +matplotlib.use("Agg") +matplotlib.rcParams["pdf.fonttype"] = 42 +matplotlib.rcParams["svg.fonttype"] = "none" import deeptools.cm # noqa: F401 import matplotlib.pyplot as plt from matplotlib.font_manager import FontProperties @@ -28,29 +30,33 @@ debug = 0 -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") plt.ioff() def parse_arguments(args=None): parser = argparse.ArgumentParser( - parents=[parserCommon.heatmapperMatrixArgs(), - parserCommon.heatmapperOutputArgs(mode='profile'), - parserCommon.heatmapperOptionalArgs(mode='profile')], + parents=[ + parserCommon.heatmapperMatrixArgs(), + parserCommon.heatmapperOutputArgs(mode="profile"), + parserCommon.heatmapperOptionalArgs(mode="profile"), + ], formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description='This tool creates a profile plot for ' - 'scores over sets of genomic regions. ' - 'Typically, these regions are genes, but ' - 'any other regions defined in BED ' - ' will work. A matrix generated ' - 'by computeMatrix is required.', - epilog='An example usage is: plotProfile -m ', - add_help=False) + description="This tool creates a profile plot for " + "scores over sets of genomic regions. " + "Typically, these regions are genes, but " + "any other regions defined in BED " + " will work. A matrix generated " + "by computeMatrix is required.", + epilog="An example usage is: plotProfile -m ", + add_help=False, + ) return parser -def autobreaklinetitle(title,sep="[-_,.]",lmax=15): - sss = [ rr for rr in re.split(sep,title) if len(rr) ] + +def autobreaklinetitle(title, sep="[-_,.]", lmax=15): + sss = [rr for rr in re.split(sep, title) if len(rr)] newtitle, tmp = "", "" for ss in sss: tmp += ss @@ -62,23 +68,24 @@ def autobreaklinetitle(title,sep="[-_,.]",lmax=15): newtitle += tmp.strip("-") + "\n" return newtitle + def process_args(args=None): args = parse_arguments().parse_args(args) # Ensure that yMin/yMax are there and a list try: - assert(args.yMin is not None) + assert args.yMin is not None except: args.yMin = [None] try: - assert(args.yMax is not None) + assert args.yMax is not None except: args.yMax = [None] # Sometimes Galaxy sends --yMax '' and --yMin '' - if args.yMin == ['']: + if args.yMin == [""]: args.yMin = [None] - if args.yMax == ['']: + if args.yMax == [""]: args.yMax = [None] # Convert to floats @@ -98,23 +105,29 @@ def process_args(args=None): class Profile(object): - - def __init__(self, hm, out_file_name, - plot_title='', y_axis_label='', - y_min=None, y_max=None, - averagetype='median', - reference_point_label=None, - start_label='TSS', end_label='TES', - plot_height=7, - plot_width=11, - per_group=False, - plot_type='lines', - image_format=None, - color_list=None, - legend_location='best', - plots_per_row=8, - label_rotation=0, - dpi=200): + def __init__( + self, + hm, + out_file_name, + plot_title="", + y_axis_label="", + y_min=None, + y_max=None, + averagetype="median", + reference_point_label=None, + start_label="TSS", + end_label="TES", + plot_height=7, + plot_width=11, + per_group=False, + plot_type="lines", + image_format=None, + color_list=None, + legend_location="best", + plots_per_row=8, + 
label_rotation=0, + dpi=200, + ): """ Using the hm matrix, makes a line plot either per group or per sample @@ -167,7 +180,7 @@ def __init__(self, hm, out_file_name, # Honor reference point labels from computeMatrix if reference_point_label is None: - self.reference_point_label = hm.parameters['ref point'] + self.reference_point_label = hm.parameters["ref point"] # decide how many plots are needed if self.per_group: @@ -185,13 +198,15 @@ def __init__(self, hm, out_file_name, cols = self.numplots self.grids = gridspec.GridSpec(rows, cols) - plt.rcParams['font.size'] = 10.0 + plt.rcParams["font.size"] = 10.0 self.font_p = FontProperties() - self.font_p.set_size('small') + self.font_p.set_size("small") # convert cm values to inches plot_height_inches = rows * self.cm2inch(self.plot_height)[0] - self.fig = plt.figure(figsize=self.cm2inch(cols * self.plot_width, rows * self.plot_height)) + self.fig = plt.figure( + figsize=self.cm2inch(cols * self.plot_width, rows * self.plot_height) + ) self.fig.suptitle(self.plot_title, y=(1 - (0.06 / plot_height_inches))) # Ensure that the labels are vectors @@ -207,7 +222,13 @@ def getTicks(self, idx): """ This is essentially a wrapper around getProfileTicks to accommodate the fact that each column has its own ticks. """ - xticks, xtickslabel = getProfileTicks(self.hm, self.reference_point_label[idx], self.start_label[idx], self.end_label[idx], idx) + xticks, xtickslabel = getProfileTicks( + self.hm, + self.reference_point_label[idx], + self.start_label[idx], + self.end_label[idx], + idx, + ) return xticks, xtickslabel @staticmethod @@ -220,8 +241,9 @@ def cm2inch(*tupl): def plot_hexbin(self): from matplotlib import cm + cmap = cm.coolwarm - cmap.set_bad('black') + cmap.set_bad("black") if self.image_format == "plotly": return self.plotly_hexbin() @@ -234,17 +256,20 @@ def plot_hexbin(self): # split the ax to make room for the colorbar and for each of the # groups - sub_grid = gridspec.GridSpecFromSubplotSpec(self.numlines, 2, subplot_spec=self.grids[row, col], - width_ratios=[0.92, 0.08], wspace=0.05, hspace=0.1) + sub_grid = gridspec.GridSpecFromSubplotSpec( + self.numlines, + 2, + subplot_spec=self.grids[row, col], + width_ratios=[0.92, 0.08], + wspace=0.05, + hspace=0.1, + ) ax = self.fig.add_subplot(sub_grid[0, 0]) ax.tick_params( - axis='y', - which='both', - left=False, - right=False, - labelleft=True) + axis="y", which="both", left=False, right=False, labelleft=True ) if self.per_group: title = self.hm.matrix.group_labels[plot] @@ -261,7 +286,7 @@ def plot_hexbin(self): _row, _col = data_idx, plot sub_matrix = self.hm.matrix.get_matrix(_row, _col) - ma = sub_matrix['matrix'] + ma = sub_matrix["matrix"] x_values = np.tile(np.arange(ma.shape[1]), (ma.shape[0], 1)) img = ax.hexbin(x_values.flatten(), ma.flatten(), cmap=cmap, mincnt=1) _vmin, _vmax = img.get_clim() @@ -293,19 +318,26 @@ def plot_hexbin(self): sub_matrix = self.hm.matrix.get_matrix(_row, _col) if self.per_group: - label = sub_matrix['sample'] + label = sub_matrix["sample"] else: - label = sub_matrix['group'] + label = sub_matrix["group"] - ma = sub_matrix['matrix'] + ma = sub_matrix["matrix"] try: # matplotlib 2.0 - ax.set_facecolor('black') + ax.set_facecolor("black") except: # matplotlib <2.0 - ax.set_axis_bgcolor('black') + ax.set_axis_bgcolor("black") x_values = np.tile(np.arange(ma.shape[1]), (ma.shape[0], 1)) - img = ax.hexbin(x_values.flatten(), ma.flatten(), cmap=cmap, mincnt=1, vmin=vmin, vmax=vmax) + img = ax.hexbin( + x_values.flatten(), + ma.flatten(), + cmap=cmap, + 
mincnt=1, + vmin=vmin, + vmax=vmax, + ) if plot == 0: ax.axes.set_ylabel(label) @@ -323,7 +355,7 @@ def plot_hexbin(self): xticks, xtickslabel = self.getTicks(plot) if np.ceil(max(xticks)) != float(ma.shape[1] - 1): - tickscale = float(sub_matrix['matrix'].shape[1]) / max(xticks) + tickscale = float(sub_matrix["matrix"].shape[1]) / max(xticks) xticks_use = [x * tickscale for x in xticks] ax_list[0].axes.set_xticks(xticks_use) else: @@ -333,8 +365,8 @@ def plot_hexbin(self): # such that they don't fall off # the heatmap sides ticks = ax_list[-1].xaxis.get_major_ticks() - ticks[0].label1.set_horizontalalignment('left') - ticks[-1].label1.set_horizontalalignment('right') + ticks[0].label1.set_horizontalalignment("left") + ticks[-1].label1.set_horizontalalignment("right") cax = self.fig.add_subplot(sub_grid[:, 1]) self.fig.colorbar(img, cax=cax) @@ -347,11 +379,13 @@ def plot_hexbin(self): def plotly_hexbin(self): """plot_hexbin, but for plotly. it's annoying that we have to have sub-subplots""" fig = go.Figure() - cols = self.plots_per_row if self.numplots > self.plots_per_row else self.numplots + cols = ( + self.plots_per_row if self.numplots > self.plots_per_row else self.numplots + ) rows = np.ceil(self.numplots / float(cols)).astype(int) - fig['layout'].update(title=self.plot_title) - domainWidth = .9 / cols - domainHeight = .9 / rows + fig["layout"].update(title=self.plot_title) + domainWidth = 0.9 / cols + domainHeight = 0.9 / rows bufferHeight = 0.0 if rows > 1: bufferHeight = 0.1 / (rows - 1) @@ -387,9 +421,21 @@ def plotly_hexbin(self): base = col * (domainWidth + bufferWidth) domain = [base, base + domainWidth] titleX = base + 0.5 * domainWidth - xanchor = 'x{}'.format(i + 1) - fig['layout']['xaxis{}'.format(i + 1)] = dict(domain=domain) - annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': title, 'y': titleY, 'x': titleX, 'font': {'size': 16}, 'showarrow': False}) + xanchor = "x{}".format(i + 1) + fig["layout"]["xaxis{}".format(i + 1)] = dict(domain=domain) + annos.append( + { + "yanchor": "bottom", + "xref": "paper", + "xanchor": "center", + "yref": "paper", + "text": title, + "y": titleY, + "x": titleX, + "font": {"size": 16}, + "showarrow": False, + } + ) # set yMin/yMax yMin = np.inf @@ -401,7 +447,7 @@ def plotly_hexbin(self): else: _row, _col = j, i - ma = self.hm.matrix.get_matrix(_row, _col)['matrix'] + ma = self.hm.matrix.get_matrix(_row, _col)["matrix"] if np.min(ma) < yMin: yMin = np.min(ma) if np.max(ma) > yMax: @@ -417,30 +463,53 @@ def plotly_hexbin(self): else: _row, _col = j, i foo = i * self.numlines + j + 1 - yanchor = 'y{}'.format(foo) + yanchor = "y{}".format(foo) base = row * (domainHeight + bufferHeight) + j * subHeight domain = [base, base + subHeight] - fig['layout']['yaxis{}'.format(foo)] = {'domain': domain, 'title': self.y_axis_label, 'anchor': xanchor, 'range': [yMin, yMax]} + fig["layout"]["yaxis{}".format(foo)] = { + "domain": domain, + "title": self.y_axis_label, + "anchor": xanchor, + "range": [yMin, yMax], + } if j == 0: _ = "xaxis{}".format(xanchor[1:]) - fig['layout'][_].update(anchor='y{}'.format(foo)) + fig["layout"][_].update(anchor="y{}".format(foo)) if col == 0: titleY = base + 0.5 * subHeight - annos.append({'yanchor': 'middle', 'xref': 'paper', 'xanchor': 'left', 'yref': 'paper', 'text': sideLabels[j], 'y': titleY, 'x': -0.03, 'font': {'size': 16}, 'showarrow': False, 'textangle': -90}) + annos.append( + { + "yanchor": "middle", + "xref": "paper", + "xanchor": "left", + "yref": "paper", + 
"text": sideLabels[j], + "y": titleY, + "x": -0.03, + "font": {"size": 16}, + "showarrow": False, + "textangle": -90, + } + ) sub_matrix = self.hm.matrix.get_matrix(_row, _col) - ma = self.hm.matrix.get_matrix(_row, _col)['matrix'] + ma = self.hm.matrix.get_matrix(_row, _col)["matrix"] - fig['layout']['xaxis{}'.format(i + 1)].update(range=[0, ma.shape[1]]) + fig["layout"]["xaxis{}".format(i + 1)].update(range=[0, ma.shape[1]]) if self.per_group: - label = sub_matrix['sample'] + label = sub_matrix["sample"] else: - label = sub_matrix['group'] + label = sub_matrix["group"] # Manually compute the 2D histogram with 100x100 bins x_values = np.tile(np.arange(ma.shape[1]), (ma.shape[0], 1)) - z, xe, ye = np.histogram2d(x_values.flatten(), ma.flatten(), bins=100, range=[[0, ma.shape[1]], [yMin, yMax]]) + z, xe, ye = np.histogram2d( + x_values.flatten(), + ma.flatten(), + bins=100, + range=[[0, ma.shape[1]], [yMin, yMax]], + ) _vmin = np.min(z) _vmax = np.max(z) @@ -449,7 +518,15 @@ def plotly_hexbin(self): if _vmax > vmax: vmax = _vmax - trace = go.Contour(z=z.T, x=xe, y=ye, xaxis=xanchor, yaxis=yanchor, name=label, connectgaps=False) + trace = go.Contour( + z=z.T, + x=xe, + y=ye, + xaxis=xanchor, + yaxis=yanchor, + name=label, + connectgaps=False, + ) data.append(trace) # Assume the bounds for the last graph are correct @@ -461,18 +538,25 @@ def plotly_hexbin(self): else: xticks_use = xticks xticks_use = [np.ceil(x) for x in xticks_use] - fig['layout']['xaxis{}'.format(i + 1)].update(tickmode='array', tickvals=xticks_use, ticktext=xtickslabel, tickangle=self.label_rotation) + fig["layout"]["xaxis{}".format(i + 1)].update( + tickmode="array", + tickvals=xticks_use, + ticktext=xtickslabel, + tickangle=self.label_rotation, + ) for trace in data: trace.update(zmin=vmin, zmax=vmax) fig.add_traces(data) - fig['layout']['annotations'] = annos + fig["layout"]["annotations"] = annos py.plot(fig, filename=self.out_file_name, auto_open=False) def plot_heatmap(self): - cmap = ['RdYlBu_r'] - if self.color_list is not None: # check the length to be equal to the numebr of plots otherwise multiply it! + cmap = ["RdYlBu_r"] + if ( + self.color_list is not None + ): # check the length to be equal to the numebr of plots otherwise multiply it! 
cmap = self.color_list if len(cmap) < self.numplots: all_colors = cmap @@ -507,18 +591,20 @@ def plot_heatmap(self): localYMax = None # split the ax to make room for the colorbar - sub_grid = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=self.grids[row, col], - width_ratios=[0.92, 0.08], wspace=0.05) + sub_grid = gridspec.GridSpecFromSubplotSpec( + 1, + 2, + subplot_spec=self.grids[row, col], + width_ratios=[0.92, 0.08], + wspace=0.05, + ) ax = self.fig.add_subplot(sub_grid[0]) cax = self.fig.add_subplot(sub_grid[1]) ax.tick_params( - axis='y', - which='both', - left=False, - right=False, - labelleft=True) + axis="y", which="both", left=False, right=False, labelleft=True + ) if self.per_group: title = self.hm.matrix.group_labels[plot] @@ -541,13 +627,23 @@ def plot_heatmap(self): sub_matrix = self.hm.matrix.get_matrix(row, col) if self.per_group: - label = sub_matrix['sample'] + label = sub_matrix["sample"] else: - label = sub_matrix['group'] + label = sub_matrix["group"] labels.append(label) - mat.append(np.ma.__getattribute__(self.averagetype)(sub_matrix['matrix'], axis=0)) - img = ax.imshow(np.vstack(mat), interpolation='nearest', - cmap=cmap[plot], aspect='auto', vmin=localYMin, vmax=localYMax) + mat.append( + np.ma.__getattribute__(self.averagetype)( + sub_matrix["matrix"], axis=0 + ) + ) + img = ax.imshow( + np.vstack(mat), + interpolation="nearest", + cmap=cmap[plot], + aspect="auto", + vmin=localYMin, + vmax=localYMax, + ) self.fig.colorbar(img, cax=cax) totalWidth = np.vstack(mat).shape[1] @@ -563,12 +659,14 @@ def plot_heatmap(self): # such that they don't fall off # the heatmap sides ticks = ax.xaxis.get_major_ticks() - ticks[0].label1.set_horizontalalignment('left') - ticks[-1].label1.set_horizontalalignment('right') + ticks[0].label1.set_horizontalalignment("left") + ticks[-1].label1.set_horizontalalignment("right") # add labels as y ticks labels ymin, ymax = ax.axes.get_ylim() - pos, distance = np.linspace(ymin, ymax, len(labels), retstep=True, endpoint=False) + pos, distance = np.linspace( + ymin, ymax, len(labels), retstep=True, endpoint=False + ) d_half = float(distance) / 2 yticks = [x + d_half for x in pos] @@ -592,11 +690,13 @@ def plot_heatmap(self): def plotly_heatmap(self): """plot_heatmap, but with plotly output""" fig = go.Figure() - cols = self.plots_per_row if self.numplots > self.plots_per_row else self.numplots + cols = ( + self.plots_per_row if self.numplots > self.plots_per_row else self.numplots + ) rows = np.ceil(self.numplots / float(cols)).astype(int) - fig['layout'].update(title=self.plot_title) - domainWidth = .9 / cols - domainHeight = .9 / rows + fig["layout"].update(title=self.plot_title) + domainWidth = 0.9 / cols + domainHeight = 0.9 / rows bufferHeight = 0.0 if rows > 1: bufferHeight = 0.1 / (rows - 1) @@ -624,17 +724,36 @@ def plotly_heatmap(self): base = row * (domainHeight + bufferHeight) domain = [base, base + domainHeight] titleY = base + domainHeight - xanchor = 'x{}'.format(i + 1) - yanchor = 'y{}'.format(i + 1) + xanchor = "x{}".format(i + 1) + yanchor = "y{}".format(i + 1) visible = False if col == 0: visible = True - fig['layout']['yaxis{}'.format(i + 1)] = {'domain': domain, 'anchor': xanchor, 'visible': visible} + fig["layout"]["yaxis{}".format(i + 1)] = { + "domain": domain, + "anchor": xanchor, + "visible": visible, + } base = col * (domainWidth + bufferWidth) domain = [base, base + domainWidth] titleX = base + 0.5 * domainWidth - fig['layout']['xaxis{}'.format(i + 1)] = {'domain': domain, 'anchor': yanchor} - 
annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': title, 'y': titleY, 'x': titleX, 'font': {'size': 16}, 'showarrow': False}) + fig["layout"]["xaxis{}".format(i + 1)] = { + "domain": domain, + "anchor": yanchor, + } + annos.append( + { + "yanchor": "bottom", + "xref": "paper", + "xanchor": "center", + "yref": "paper", + "text": title, + "y": titleY, + "x": titleX, + "font": {"size": 16}, + "showarrow": False, + } + ) mat = [] labels = [] @@ -647,17 +766,28 @@ def plotly_heatmap(self): sub_matrix = self.hm.matrix.get_matrix(row, col) if self.per_group: - label = sub_matrix['sample'] + label = sub_matrix["sample"] else: - label = sub_matrix['group'] + label = sub_matrix["group"] labels.append(label) - mat.append(np.ma.__getattribute__(self.averagetype)(sub_matrix['matrix'], axis=0)) + mat.append( + np.ma.__getattribute__(self.averagetype)( + sub_matrix["matrix"], axis=0 + ) + ) if np.min(mat[-1]) < zmin: zmin = np.min(mat[-1]) if np.max(mat[-1]) > zmax: zmax = np.max(mat[-1]) totalWidth = len(mat[-1]) - trace = go.Heatmap(name=title, z=mat, x=range(totalWidth + 1), y=labels, xaxis=xanchor, yaxis=yanchor) + trace = go.Heatmap( + name=title, + z=mat, + x=range(totalWidth + 1), + y=labels, + xaxis=xanchor, + yaxis=yanchor, + ) data.append(trace) # Add ticks @@ -668,7 +798,12 @@ def plotly_heatmap(self): else: xticks_use = xticks xticks_use = [np.ceil(x) for x in xticks_use] - fig['layout']['xaxis{}'.format(i + 1)].update(tickmode='array', tickvals=xticks_use, ticktext=xtickslabel, tickangle=self.label_rotation) + fig["layout"]["xaxis{}".format(i + 1)].update( + tickmode="array", + tickvals=xticks_use, + ticktext=xtickslabel, + tickangle=self.label_rotation, + ) # Adjust color scale limits for i, trace in enumerate(data): @@ -681,7 +816,7 @@ def plotly_heatmap(self): trace.update(zmin=zminUse, zmax=zmaxUse) fig.add_traces(data) - fig['layout']['annotations'] = annos + fig["layout"]["annotations"] = annos py.plot(fig, filename=self.out_file_name, auto_open=False) def plot_profile(self): @@ -691,21 +826,30 @@ def plot_profile(self): self.y_max = [None] if not self.color_list: - cmap_plot = plt.get_cmap('jet') + cmap_plot = plt.get_cmap("jet") if self.numlines > 1: # kmeans, so we need to color by cluster - self.color_list = cmap_plot(np.arange(self.numlines, dtype=float) / float(self.numlines)) + self.color_list = cmap_plot( + np.arange(self.numlines, dtype=float) / float(self.numlines) + ) else: - self.color_list = cmap_plot(np.arange(self.numplots, dtype=float) / float(self.numplots)) - if (self.numlines > 1 and len(self.color_list) < self.numlines) or\ - (self.numlines == 1 and len(self.color_list) < self.numplots): - sys.exit("\nThe given list of colors is too small, " - "at least {} colors are needed\n".format(self.numlines)) + self.color_list = cmap_plot( + np.arange(self.numplots, dtype=float) / float(self.numplots) + ) + if (self.numlines > 1 and len(self.color_list) < self.numlines) or ( + self.numlines == 1 and len(self.color_list) < self.numplots + ): + sys.exit( + "\nThe given list of colors is too small, " + "at least {} colors are needed\n".format(self.numlines) + ) for color in self.color_list: if not pltcolors.is_color_like(color): - sys.exit("\nThe color name {} is not valid. Check " - "the name or try with a html hex string " - "for example #eeff22".format(color)) + sys.exit( + "\nThe color name {} is not valid. 
Check " + "the name or try with a html hex string " + "for example #eeff22".format(color) + ) if self.image_format == "plotly": return self.plotly_profile() @@ -749,19 +893,22 @@ def plot_profile(self): sub_matrix = self.hm.matrix.get_matrix(_row, _col) if self.per_group: - label = sub_matrix['sample'] + label = sub_matrix["sample"] else: - label = sub_matrix['group'] + label = sub_matrix["group"] if self.numlines > 1: coloridx = data_idx else: coloridx = plot - plot_single(ax, sub_matrix['matrix'], - self.averagetype, - self.color_list[coloridx], - label, - plot_type=self.plot_type) + plot_single( + ax, + sub_matrix["matrix"], + self.averagetype, + self.color_list[coloridx], + label, + plot_type=self.plot_type, + ) globalYmin = min(np.float64(globalYmin), ax.get_ylim()[0]) globalYmax = max(globalYmax, ax.get_ylim()[1]) @@ -769,7 +916,7 @@ def plot_profile(self): if col > 0 and len(self.y_min) == 1 and len(self.y_max) == 1: plt.setp(ax.get_yticklabels(), visible=False) - totalWidth = sub_matrix['matrix'].shape[1] + totalWidth = sub_matrix["matrix"].shape[1] xticks, xtickslabel = self.getTicks(tickIdx) if np.ceil(max(xticks)) != float(totalWidth - 1): tickscale = float(totalWidth) / max(xticks) @@ -782,15 +929,19 @@ def plot_profile(self): # such that they don't fall off # the heatmap sides ticks = ax.xaxis.get_major_ticks() - ticks[0].label1.set_horizontalalignment('left') - ticks[-1].label1.set_horizontalalignment('right') + ticks[0].label1.set_horizontalalignment("left") + ticks[-1].label1.set_horizontalalignment("right") - if first and self.y_axis_label != '': + if first and self.y_axis_label != "": ax.set_ylabel(self.y_axis_label) - if first and self.plot_type not in ['heatmap', 'overlapped_lines']: - ax.legend(loc=self.legend_location.replace('-', ' '), - ncol=1, prop=self.font_p, - frameon=False, markerscale=0.5) + if first and self.plot_type not in ["heatmap", "overlapped_lines"]: + ax.legend( + loc=self.legend_location.replace("-", " "), + ncol=1, + prop=self.font_p, + frameon=False, + markerscale=0.5, + ) if len(self.y_min) == 1 and len(self.y_max) == 1: first = False ax_list.append(ax) @@ -823,11 +974,13 @@ def plotly_profile(self): y_min, y_max, and color_list are set already """ fig = go.Figure() - cols = self.plots_per_row if self.numplots > self.plots_per_row else self.numplots + cols = ( + self.plots_per_row if self.numplots > self.plots_per_row else self.numplots + ) rows = np.ceil(self.numplots / float(cols)).astype(int) - fig['layout'].update(title=self.plot_title) - domainWidth = .9 / cols - domainHeight = .9 / rows + fig["layout"].update(title=self.plot_title) + domainWidth = 0.9 / cols + domainHeight = 0.9 / rows bufferHeight = 0.0 if rows > 1: bufferHeight = 0.1 / (rows - 1) @@ -843,22 +996,42 @@ def plotly_profile(self): row = np.floor(i / self.plots_per_row) # row = rows - i / self.plots_per_row - 1 col = i % self.plots_per_row - xanchor = 'x{}'.format(i + 1) - yanchor = 'y{}'.format(i + 1) + xanchor = "x{}".format(i + 1) + yanchor = "y{}".format(i + 1) base = row * (domainHeight + bufferHeight) domain = [base, base + domainHeight] titleY = base + domainHeight - fig['layout']['yaxis{}'.format(i + 1)] = {'domain': domain, 'title': self.y_axis_label, 'anchor': xanchor, 'autorange': False} + fig["layout"]["yaxis{}".format(i + 1)] = { + "domain": domain, + "title": self.y_axis_label, + "anchor": xanchor, + "autorange": False, + } base = col * (domainWidth + bufferWidth) domain = [base, base + domainWidth] titleX = base + 0.5 * domainWidth - 
fig['layout']['xaxis{}'.format(i + 1)] = {'domain': domain, 'anchor': yanchor} + fig["layout"]["xaxis{}".format(i + 1)] = { + "domain": domain, + "anchor": yanchor, + } if self.per_group: title = self.hm.matrix.group_labels[i] else: title = self.hm.matrix.sample_labels[i] - annos.append({'yanchor': 'bottom', 'xref': 'paper', 'xanchor': 'center', 'yref': 'paper', 'text': title, 'y': titleY, 'x': titleX, 'font': {'size': 16}, 'showarrow': False}) + annos.append( + { + "yanchor": "bottom", + "xref": "paper", + "xanchor": "center", + "yref": "paper", + "text": title, + "y": titleY, + "x": titleX, + "font": {"size": 16}, + "showarrow": False, + } + ) for j in range(self.numlines): if self.per_group: @@ -867,33 +1040,37 @@ def plotly_profile(self): _row, _col = j, i sub_matrix = self.hm.matrix.get_matrix(_row, _col) - fig['layout']['xaxis{}'.format(i + 1)].update(range=[0, sub_matrix['matrix'].shape[1]]) + fig["layout"]["xaxis{}".format(i + 1)].update( + range=[0, sub_matrix["matrix"].shape[1]] + ) if self.per_group: - label = sub_matrix['sample'] + label = sub_matrix["sample"] else: - label = sub_matrix['group'] + label = sub_matrix["group"] if self.numlines > 1: coloridx = j else: coloridx = i color = self.color_list[coloridx] - traces = plotly_single(sub_matrix['matrix'], - self.averagetype, - color, - label, - plot_type=self.plot_type) + traces = plotly_single( + sub_matrix["matrix"], + self.averagetype, + color, + label, + plot_type=self.plot_type, + ) for trace in traces: trace.update(xaxis=xanchor, yaxis=yanchor) - if yMin is None or min(trace['y']) < yMin: - yMin = min(trace['y']) - if yMax is None or max(trace['y']) > yMax: - yMax = max(trace['y']) + if yMin is None or min(trace["y"]) < yMin: + yMin = min(trace["y"]) + if yMax is None or max(trace["y"]) > yMax: + yMax = max(trace["y"]) if row == col == 0: traces[0].update(showlegend=True) data.extend(traces) - totalWidth = sub_matrix['matrix'].shape[1] + totalWidth = sub_matrix["matrix"].shape[1] xticks, xtickslabel = self.getTicks(i) if np.ceil(max(xticks)) != float(totalWidth): tickscale = float(totalWidth) / max(xticks) @@ -901,20 +1078,25 @@ def plotly_profile(self): else: xticks_use = xticks xticks_use = [np.ceil(x) for x in xticks_use] - fig['layout']['xaxis{}'.format(i + 1)].update(tickmode='array', tickvals=xticks_use, ticktext=xtickslabel, tickangle=self.label_rotation) + fig["layout"]["xaxis{}".format(i + 1)].update( + tickmode="array", + tickvals=xticks_use, + ticktext=xtickslabel, + tickangle=self.label_rotation, + ) # Set the y limits for i in range(self.numplots): - yaxis = 'yaxis{}'.format(i + 1) + yaxis = "yaxis{}".format(i + 1) yRange = [yMin, yMax] if self.y_min[i % len(self.y_min)] is not None: yRange[0] = self.y_min[i % len(self.y_min)] if self.y_max[i % len(self.y_max)] is not None: yRange[1] = self.y_max[i % len(self.y_max)] - fig['layout'][yaxis].update(range=yRange) + fig["layout"][yaxis].update(range=yRange) fig.add_traces(data) - fig['layout']['annotations'] = annos + fig["layout"]["annotations"] = annos py.plot(fig, filename=self.out_file_name, auto_open=False) @@ -925,21 +1107,40 @@ def main(args=None): args.matrixFile.close() hm.read_matrix_file(matrix_file) - if hm.parameters['min threshold'] is not None or hm.parameters['max threshold'] is not None: - filterHeatmapValues(hm, hm.parameters['min threshold'], hm.parameters['max threshold']) + if ( + hm.parameters["min threshold"] is not None + or hm.parameters["max threshold"] is not None + ): + filterHeatmapValues( + hm, hm.parameters["min threshold"], 
hm.parameters["max threshold"] + ) if args.kmeans is not None: - hm.matrix.hmcluster(args.kmeans, method='kmeans', clustering_samples=args.clusterUsingSamples) + hm.matrix.hmcluster( + args.kmeans, method="kmeans", clustering_samples=args.clusterUsingSamples + ) else: if args.hclust is not None: - print("Performing hierarchical clustering." - "Please note that it might be very slow for large datasets.\n") - hm.matrix.hmcluster(args.hclust, method='hierarchical', clustering_samples=args.clusterUsingSamples) - - group_len_ratio = np.diff(hm.matrix.group_boundaries) / float(len(hm.matrix.regions)) + print( + "Performing hierarchical clustering." + "Please note that it might be very slow for large datasets.\n" + ) + hm.matrix.hmcluster( + args.hclust, + method="hierarchical", + clustering_samples=args.clusterUsingSamples, + ) + + group_len_ratio = np.diff(hm.matrix.group_boundaries) / float( + len(hm.matrix.regions) + ) if np.any(group_len_ratio < 5.0 / 1000): problem = np.flatnonzero(group_len_ratio < 5.0 / 1000) - sys.stderr.write("WARNING: Group '{}' is too small for plotting, you might want to remove it. \n".format(hm.matrix.group_labels[problem[0]])) + sys.stderr.write( + "WARNING: Group '{}' is too small for plotting, you might want to remove it. \n".format( + hm.matrix.group_labels[problem[0]] + ) + ) if args.regionsLabel: hm.matrix.set_group_labels(args.regionsLabel) @@ -948,36 +1149,43 @@ def main(args=None): hm.matrix.set_sample_labels(args.samplesLabel) if args.outFileNameData: - hm.save_tabulated_values(args.outFileNameData, reference_point_label=args.refPointLabel, - start_label=args.startLabel, - end_label=args.endLabel, - averagetype=args.averageType) + hm.save_tabulated_values( + args.outFileNameData, + reference_point_label=args.refPointLabel, + start_label=args.startLabel, + end_label=args.endLabel, + averagetype=args.averageType, + ) if args.outFileSortedRegions: hm.save_BED(args.outFileSortedRegions) - prof = Profile(hm, args.outFileName, - plot_title=args.plotTitle, - y_axis_label=args.yAxisLabel, - y_min=args.yMin, y_max=args.yMax, - averagetype=args.averageType, - reference_point_label=args.refPointLabel, - start_label=args.startLabel, - end_label=args.endLabel, - plot_height=args.plotHeight, - plot_width=args.plotWidth, - per_group=args.perGroup, - plot_type=args.plotType, - image_format=args.plotFileFormat, - color_list=args.colors, - legend_location=args.legendLocation, - plots_per_row=args.numPlotsPerRow, - label_rotation=args.label_rotation, - dpi=args.dpi) - - if args.plotType == 'heatmap': + prof = Profile( + hm, + args.outFileName, + plot_title=args.plotTitle, + y_axis_label=args.yAxisLabel, + y_min=args.yMin, + y_max=args.yMax, + averagetype=args.averageType, + reference_point_label=args.refPointLabel, + start_label=args.startLabel, + end_label=args.endLabel, + plot_height=args.plotHeight, + plot_width=args.plotWidth, + per_group=args.perGroup, + plot_type=args.plotType, + image_format=args.plotFileFormat, + color_list=args.colors, + legend_location=args.legendLocation, + plots_per_row=args.numPlotsPerRow, + label_rotation=args.label_rotation, + dpi=args.dpi, + ) + + if args.plotType == "heatmap": prof.plot_heatmap() - elif args.plotType == 'overlapped_lines': + elif args.plotType == "overlapped_lines": prof.plot_hexbin() else: prof.plot_profile() diff --git a/deeptools/sumCoveragePerBin.py b/deeptools/sumCoveragePerBin.py index 705da7150..33effe767 100644 --- a/deeptools/sumCoveragePerBin.py +++ b/deeptools/sumCoveragePerBin.py @@ -11,8 +11,10 @@ class 
SumCoveragePerBin(countReadsPerBin.CountReadsPerBin): r"""This is an extension of CountReadsPerBin for use with plotFingerprint. There, we need to sum the per-base coverage. """ - def get_coverage_of_region(self, bamHandle, chrom, regions, - fragmentFromRead_func=None): + + def get_coverage_of_region( + self, bamHandle, chrom, regions, fragmentFromRead_func=None + ): """ Returns a numpy array that corresponds to the number of reads that overlap with each tile. @@ -44,9 +46,9 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, nbins = 0 for reg in regions: nbins += (reg[1] - reg[0]) // reg[2] - coverages = np.zeros(nbins, dtype='float64') + coverages = np.zeros(nbins, dtype="float64") - if self.defaultFragmentLength == 'read length': + if self.defaultFragmentLength == "read length": extension = 0 else: extension = self.maxPairedFragmentLength @@ -90,13 +92,22 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, except: # bigWig input, as used by plotFingerprint if bamHandle.chroms(chrom): - _ = np.array(bamHandle.stats(chrom, regStart, regEnd, type="mean", nBins=nRegBins), dtype=np.float) + _ = np.array( + bamHandle.stats( + chrom, regStart, regEnd, type="mean", nBins=nRegBins + ), + dtype=np.float, + ) _[np.isnan(_)] = 0.0 _ = _ * tileSize coverages += _ continue else: - raise NameError("chromosome {} not found in bigWig file with chroms {}".format(chrom, bamHandle.chroms())) + raise NameError( + "chromosome {} not found in bigWig file with chroms {}".format( + chrom, bamHandle.chroms() + ) + ) prev_pos = set() lpos = None @@ -108,7 +119,10 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, continue # filter reads based on SAM flag - if self.samFlag_include and read.flag & self.samFlag_include != self.samFlag_include: + if ( + self.samFlag_include + and read.flag & self.samFlag_include != self.samFlag_include + ): continue if self.samFlag_exclude and read.flag & self.samFlag_exclude != 0: continue @@ -132,8 +146,11 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, e = s - tLen if read.reference_id != read.next_reference_id: e = read.pnext - if lpos is not None and lpos == read.reference_start \ - and (s, e, read.next_reference_id, read.is_reverse) in prev_pos: + if ( + lpos is not None + and lpos == read.reference_start + and (s, e, read.next_reference_id, read.is_reverse) in prev_pos + ): continue if lpos != read.reference_start: prev_pos.clear() @@ -167,7 +184,10 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, fragmentEnd = reg[0] + len(coverages) * tileSize sIdx = vector_start + max((fragmentStart - reg[0]) // tileSize, 0) - eIdx = vector_start + min(np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype('int'), nRegBins) + eIdx = vector_start + min( + np.ceil(float(fragmentEnd - reg[0]) / tileSize).astype("int"), + nRegBins, + ) if eIdx >= len(coverages): eIdx = len(coverages) - 1 if last_eIdx is not None: @@ -202,8 +222,17 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, if self.verbose: endTime = time.time() - print("%s, processing %s (%.1f per sec) reads @ %s:%s-%s" % ( - multiprocessing.current_process().name, c, c / (endTime - start_time), chrom, reg[0], reg[1])) + print( + "%s, processing %s (%.1f per sec) reads @ %s:%s-%s" + % ( + multiprocessing.current_process().name, + c, + c / (endTime - start_time), + chrom, + reg[0], + reg[1], + ) + ) vector_start += nRegBins @@ -215,7 +244,6 @@ def get_coverage_of_region(self, bamHandle, chrom, regions, class Tester(object): - def __init__(self): """ The 
distribution of reads between the two bam files is as follows. @@ -233,8 +261,9 @@ def __init__(self): =============== """ import os + self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/" self.bamFile1 = self.root + "testA.bam" self.bamFile2 = self.root + "testB.bam" self.bamFile_PE = self.root + "test_paired2.bam" - self.chrom = '3R' + self.chrom = "3R" diff --git a/deeptools/test/test_bamCoverage_and_bamCompare.py b/deeptools/test/test_bamCoverage_and_bamCompare.py index f0c8d40af..c63402215 100644 --- a/deeptools/test/test_bamCoverage_and_bamCompare.py +++ b/deeptools/test/test_bamCoverage_and_bamCompare.py @@ -37,73 +37,80 @@ def test_bam_coverage_arguments(): """ Test minimal command line args for bamCoverage """ - outfile = '/tmp/test_file.bg' + outfile = "/tmp/test_file.bg" for fname in [BAMFILE_B, CRAMFILE_B]: args = "--bam {} -o {} --outFileFormat bedgraph".format(fname, outfile).split() bam_cov.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['3R\t0\t50\t0\n', '3R\t50\t150\t1\n', '3R\t150\t200\t2\n'] + expected = ["3R\t0\t50\t0\n", "3R\t50\t150\t1\n", "3R\t150\t200\t2\n"] assert_equal(resp, expected) unlink(outfile) def test_bam_coverage_extend(): - outfile = '/tmp/test_file.bg' + outfile = "/tmp/test_file.bg" for fname in [BAMFILE_B, CRAMFILE_B]: - args = "-b {} -o {} --extendReads 100 --outFileFormat bedgraph".format(fname, outfile).split() + args = "-b {} -o {} --extendReads 100 --outFileFormat bedgraph".format( + fname, outfile + ).split() bam_cov.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['3R\t0\t150\t1\n', '3R\t150\t200\t3\n'] + expected = ["3R\t0\t150\t1\n", "3R\t150\t200\t3\n"] assert_equal(resp, expected) unlink(outfile) def test_bam_coverage_extend_and_normalizeUsingRPGC(): - - outfile = '/tmp/test_file.bg' + outfile = "/tmp/test_file.bg" for fname in [BAMFILE_B, CRAMFILE_B]: - args = "-b {} -o {} --normalizeUsing RPGC --effectiveGenomeSize 200 --extendReads 100 " \ - "--outFileFormat bedgraph".format(fname, outfile).split() + args = ( + "-b {} -o {} --normalizeUsing RPGC --effectiveGenomeSize 200 --extendReads 100 " + "--outFileFormat bedgraph".format(fname, outfile).split() + ) bam_cov.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() # the scale factor should be 0.5, thus the result is similar to # that of the previous test divided by 0.5 - expected = ['3R\t0\t150\t0.5\n', '3R\t150\t200\t1.5\n'] + expected = ["3R\t0\t150\t0.5\n", "3R\t150\t200\t1.5\n"] assert_equal(resp, expected) unlink(outfile) def test_bam_coverage_skipnas(): - outfile = '/tmp/test_file.bg' + outfile = "/tmp/test_file.bg" for fname in [BAMFILE_B, CRAMFILE_B]: - args = "--bam {} -o {} --outFileFormat bedgraph --skipNAs".format(fname, outfile).split() + args = "--bam {} -o {} --outFileFormat bedgraph --skipNAs".format( + fname, outfile + ).split() bam_cov.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['3R\t50\t150\t1\n', '3R\t150\t200\t2\n'] + expected = ["3R\t50\t150\t1\n", "3R\t150\t200\t2\n"] assert_equal(resp, expected) unlink(outfile) def test_bam_coverage_filtering(): - outfile = '/tmp/test_file.bg' + outfile = "/tmp/test_file.bg" for fname in [BAMFILE_B, CRAMFILE_B]: - args = "--bam {} -o {} --outFileFormat bedgraph --ignoreDuplicates --verbose".format(fname, outfile).split() + args = "--bam {} -o {} 
--outFileFormat bedgraph --ignoreDuplicates --verbose".format( + fname, outfile + ).split() bam_cov.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['3R\t0\t50\t0\n', '3R\t50\t200\t1\n'] + expected = ["3R\t0\t50\t0\n", "3R\t50\t200\t1\n"] assert_equal(resp, expected) unlink(outfile) @@ -114,16 +121,20 @@ def test_bam_compare_arguments(): between the same file is taken, therefore, the expected value is 1.0 for all bins. """ - outfile = '/tmp/test_file.bg' + outfile = "/tmp/test_file.bg" for fname in [BAMFILE_B, CRAMFILE_B]: - args = "--bamfile1 {} --bamfile2 {} " \ - "-o {} -p 1 --outFileFormat bedgraph --operation ratio".format(fname, fname, outfile).split() + args = ( + "--bamfile1 {} --bamfile2 {} " + "-o {} -p 1 --outFileFormat bedgraph --operation ratio".format( + fname, fname, outfile + ).split() + ) bam_comp.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['3R\t0\t200\t1\n'] + expected = ["3R\t0\t200\t1\n"] assert_equal(resp, expected) unlink(outfile) @@ -132,16 +143,23 @@ def test_bam_compare_diff_files(): """ Test with two different files """ - outfile = '/tmp/test_file.bg' + outfile = "/tmp/test_file.bg" for A, B in [(BAMFILE_A, BAMFILE_B), (CRAMFILE_A, CRAMFILE_B)]: - args = "--bamfile1 {} --bamfile2 {} --scaleFactors 1:1 --operation subtract " \ - "-o {} -p 1 --outFileFormat bedgraph".format(A, B, outfile).split() + args = ( + "--bamfile1 {} --bamfile2 {} --scaleFactors 1:1 --operation subtract " + "-o {} -p 1 --outFileFormat bedgraph".format(A, B, outfile).split() + ) bam_comp.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['3R\t0\t50\t0\n', '3R\t50\t100\t-1\n', '3R\t100\t150\t0\n', '3R\t150\t200\t-1\n'] + expected = [ + "3R\t0\t50\t0\n", + "3R\t50\t100\t-1\n", + "3R\t100\t150\t0\n", + "3R\t150\t200\t-1\n", + ] assert_equal(resp, expected) unlink(outfile) @@ -150,15 +168,22 @@ def test_bam_compare_pseudocounts(): """ Test with different pseudocounts """ - outfile = '/tmp/test_file.bg' - args = "--bamfile1 {} --bamfile2 {} --outFileFormat bedgraph --scaleFactors 1:1 -o {} " \ - "--pseudocount 1 0".format(BAMFILE_A, BAMFILE_B, outfile).split() + outfile = "/tmp/test_file.bg" + args = ( + "--bamfile1 {} --bamfile2 {} --outFileFormat bedgraph --scaleFactors 1:1 -o {} " + "--pseudocount 1 0".format(BAMFILE_A, BAMFILE_B, outfile).split() + ) bam_comp.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['3R\t0\t50\tinf\n', '3R\t50\t100\t0\n', '3R\t100\t150\t1\n', '3R\t150\t200\t0\n'] + expected = [ + "3R\t0\t50\tinf\n", + "3R\t50\t100\t0\n", + "3R\t100\t150\t1\n", + "3R\t150\t200\t0\n", + ] assert_equal(resp, expected) unlink(outfile) @@ -167,15 +192,17 @@ def test_bam_compare_ZoverZ(): """ Ensure --skipZeroOverZero works in bamCompare """ - outfile = '/tmp/test_file.bg' - args = "--bamfile1 {} --bamfile2 {} --outFileFormat bedgraph --scaleFactors 1:1 -o {} " \ - "--skipZeroOverZero".format(BAMFILE_A, BAMFILE_B, outfile).split() + outfile = "/tmp/test_file.bg" + args = ( + "--bamfile1 {} --bamfile2 {} --outFileFormat bedgraph --scaleFactors 1:1 -o {} " + "--skipZeroOverZero".format(BAMFILE_A, BAMFILE_B, outfile).split() + ) bam_comp.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['3R\t50\t100\t-1\n', '3R\t100\t150\t0\n', 
'3R\t150\t200\t-0.584963\n'] + expected = ["3R\t50\t100\t-1\n", "3R\t100\t150\t0\n", "3R\t150\t200\t-0.584963\n"] assert_equal(resp, expected) unlink(outfile) @@ -195,15 +222,21 @@ def test_get_num_kept_reads(): assert total_reads == 3, "num total reads is wrong" # ignore chr_cigar to count the total number of reads - args = "--bam {} --ignoreForNormalization chr_cigar -o /tmp/test".format(fname).split() + args = "--bam {} --ignoreForNormalization chr_cigar -o /tmp/test".format( + fname + ).split() args = bam_cov.process_args(args) num_kept_reads, total_reads = gs.get_num_kept_reads(args, None) # the number of kept reads should be 2 as the read on chr_cigar is skipped - assert num_kept_reads == 2, "num_kept_reads is wrong ({})".format(num_kept_reads) + assert num_kept_reads == 2, "num_kept_reads is wrong ({})".format( + num_kept_reads + ) # test filtering by read direction. Only forward reads are kept - args = "--bam {} -o /tmp/test --samFlagExclude 16 --ignoreForNormalization chr_cigar ".format(fname).split() + args = "--bam {} -o /tmp/test --samFlagExclude 16 --ignoreForNormalization chr_cigar ".format( + fname + ).split() args = bam_cov.process_args(args) num_kept_reads, total_reads = gs.get_num_kept_reads(args, None) @@ -218,16 +251,20 @@ def test_bam_compare_diff_files_skipnas(): Compared to the previous tests, any region that does not have coverage (in either of the bam files) is not included in the bedgraph file. """ - outfile = '/tmp/test_file.bg' + outfile = "/tmp/test_file.bg" for A, B in [(BAMFILE_A, BAMFILE_B), (CRAMFILE_A, CRAMFILE_B)]: - args = "--bamfile1 {} --bamfile2 {} --scaleFactors 1:1 --operation subtract " \ - "-o {} -p 1 --outFileFormat bedgraph --skipNAs".format(A, B, outfile).split() + args = ( + "--bamfile1 {} --bamfile2 {} --scaleFactors 1:1 --operation subtract " + "-o {} -p 1 --outFileFormat bedgraph --skipNAs".format( + A, B, outfile + ).split() + ) bam_comp.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['3R\t100\t150\t0\n', '3R\t150\t200\t-1\n'] + expected = ["3R\t100\t150\t0\n", "3R\t150\t200\t-1\n"] assert_equal(resp, expected) unlink(outfile) @@ -236,16 +273,18 @@ def test_bam_compare_extend(): """ Test read extension """ - outfile = '/tmp/test_file.bg' + outfile = "/tmp/test_file.bg" for A, B in [(BAMFILE_A, BAMFILE_B), (CRAMFILE_A, CRAMFILE_B)]: - args = "--bamfile1 {} --bamfile2 {} --extend 100 --scaleFactors 1:1 --operation subtract " \ - "-o {} -p 1 --outFileFormat bedgraph".format(A, B, outfile).split() + args = ( + "--bamfile1 {} --bamfile2 {} --extend 100 --scaleFactors 1:1 --operation subtract " + "-o {} -p 1 --outFileFormat bedgraph".format(A, B, outfile).split() + ) bam_comp.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['3R\t0\t100\t-1\n', '3R\t100\t150\t1\n', '3R\t150\t200\t-1\n'] + expected = ["3R\t0\t100\t-1\n", "3R\t100\t150\t1\n", "3R\t150\t200\t-1\n"] assert_equal(resp, expected) unlink(outfile) @@ -254,15 +293,17 @@ def test_bam_compare_scale_factors_ratio(): """ Test scale factor """ - outfile = '/tmp/test_file.bg' + outfile = "/tmp/test_file.bg" for A, B in [(BAMFILE_A, BAMFILE_B), (CRAMFILE_A, CRAMFILE_B)]: - args = "--bamfile1 {} --bamfile2 {} --operation ratio --ignoreForNormalization chr_cigar " \ - "-o {} -p 1 --outFileFormat bedgraph".format(A, B, outfile).split() + args = ( + "--bamfile1 {} --bamfile2 {} --operation ratio --ignoreForNormalization chr_cigar " + "-o {} -p 1 --outFileFormat 
bedgraph".format(A, B, outfile).split() + ) bam_comp.main(args) # The scale factors are [ 1. 0.5] because BAMFILE_B has double the amount of reads (4) compared to BAMFILE_A - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() @@ -285,7 +326,12 @@ def test_bam_compare_scale_factors_ratio(): (scale factors [1,0.5]) (1+1)/(1+1*0.5)=1.33 """ - expected = ['3R\t0\t50\t1\n', '3R\t50\t100\t0.666667\n', '3R\t100\t150\t1.33333\n', '3R\t150\t200\t1\n'] + expected = [ + "3R\t0\t50\t1\n", + "3R\t50\t100\t0.666667\n", + "3R\t100\t150\t1.33333\n", + "3R\t150\t200\t1\n", + ] assert_equal(resp, expected) unlink(outfile) @@ -294,15 +340,19 @@ def test_bam_compare_scale_factors_subtract(): """ Test scale factor """ - outfile = '/tmp/test_file.bg' + outfile = "/tmp/test_file.bg" for A, B in [(BAMFILE_A, BAMFILE_B), (CRAMFILE_A, CRAMFILE_B)]: - args = "--bamfile1 {} --bamfile2 {} --operation subtract --ignoreForNormalization chr_cigar " \ - "-o {} -p 1 --outFileFormat bedgraph --scaleFactorsMethod None --normalizeUsing CPM".format(A, B, outfile).split() + args = ( + "--bamfile1 {} --bamfile2 {} --operation subtract --ignoreForNormalization chr_cigar " + "-o {} -p 1 --outFileFormat bedgraph --scaleFactorsMethod None --normalizeUsing CPM".format( + A, B, outfile + ).split() + ) bam_comp.main(args) # The scale factors are [ 1. 0.5] because BAMFILE_B has dowble the amount of reads (4) compared to BAMFILE_A - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() @@ -327,7 +377,12 @@ def test_bam_compare_scale_factors_subtract(): """ - expected = ['3R\t0\t50\t0\n', '3R\t50\t100\t-250000\n', '3R\t100\t150\t250000\n', '3R\t150\t200\t0\n'] + expected = [ + "3R\t0\t50\t0\n", + "3R\t50\t100\t-250000\n", + "3R\t100\t150\t250000\n", + "3R\t150\t200\t0\n", + ] assert_equal(resp, expected) unlink(outfile) @@ -336,27 +391,39 @@ def test_bam_coverage_filter_blacklist(): """ Test --samFlagInclude --samFlagExclude --minMappingQuality --ignoreDuplicates and --blackListFileName """ - outfile = '/tmp/test_file_filter.bg' + outfile = "/tmp/test_file_filter.bg" for fname in [BAMFILE_FILTER1, CRAMFILE_FILTER1]: - args = "--bam {} --normalizeUsing RPGC --effectiveGenomeSize 1400 -p 1 -o {} -of bedgraph --samFlagInclude 512 " \ - "--samFlagExclude 256 --minMappingQuality 5 --ignoreDuplicates " \ - "--blackListFileName {}".format(fname, outfile, BEDFILE_FILTER) + args = ( + "--bam {} --normalizeUsing RPGC --effectiveGenomeSize 1400 -p 1 -o {} -of bedgraph --samFlagInclude 512 " + "--samFlagExclude 256 --minMappingQuality 5 --ignoreDuplicates " + "--blackListFileName {}".format(fname, outfile, BEDFILE_FILTER) + ) args = args.split() bam_cov.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['3R\t0\t100\t0\n', '3R\t100\t150\t1.42338\n', - '3R\t150\t250\t4.88017\n', '3R\t250\t300\t3.05011\n', - '3R\t300\t400\t2.23675\n', '3R\t400\t450\t3.86347\n', - '3R\t450\t500\t4.06681\n', '3R\t500\t550\t2.03341\n', - '3R\t550\t600\t2.44009\n', '3R\t600\t650\t4.47349\n', - '3R\t650\t700\t3.45679\n', '3R\t700\t750\t3.66013\n', - '3R\t750\t800\t4.06681\n', '3R\t900\t950\t2.44009\n', - '3R\t950\t1000\t1.62672\n', '3R\t1000\t1050\t0.813362\n', - '3R\t1050\t1500\t0\n'] + expected = [ + "3R\t0\t100\t0\n", + "3R\t100\t150\t1.42338\n", + "3R\t150\t250\t4.88017\n", + "3R\t250\t300\t3.05011\n", + "3R\t300\t400\t2.23675\n", + "3R\t400\t450\t3.86347\n", + "3R\t450\t500\t4.06681\n", + "3R\t500\t550\t2.03341\n", + 
"3R\t550\t600\t2.44009\n", + "3R\t600\t650\t4.47349\n", + "3R\t650\t700\t3.45679\n", + "3R\t700\t750\t3.66013\n", + "3R\t750\t800\t4.06681\n", + "3R\t900\t950\t2.44009\n", + "3R\t950\t1000\t1.62672\n", + "3R\t1000\t1050\t0.813362\n", + "3R\t1050\t1500\t0\n", + ] assert_equal(resp, expected) unlink(outfile) @@ -366,7 +433,7 @@ def test_bam_coverage_offset1(): """ Test -bs 1 --Offset 1 """ - outfile = '/tmp/test_offset.bw' + outfile = "/tmp/test_offset.bw" for fname in [BAMFILE_A, CRAMFILE_A]: args = "--Offset 1 --bam {} -p 1 -bs 1 -o {}".format(fname, outfile) args = args.split() @@ -376,7 +443,7 @@ def test_bam_coverage_offset1(): filecmp.clear_cache() except: pass - assert(filecmp.cmp(outfile, "{}testA_offset1.bw".format(ROOT)) is True) + assert filecmp.cmp(outfile, "{}testA_offset1.bw".format(ROOT)) is True unlink(outfile) @@ -384,7 +451,7 @@ def test_bam_coverage_offset1_10(): """ Test -bs 1 --Offset 1 10 """ - outfile = '/tmp/test_offset.bw' + outfile = "/tmp/test_offset.bw" for fname in [BAMFILE_A, CRAMFILE_A]: args = "--Offset 1 10 -b {} -p 1 -bs 1 -o {}".format(fname, outfile) args = args.split() @@ -394,7 +461,7 @@ def test_bam_coverage_offset1_10(): filecmp.clear_cache() except: pass - assert(filecmp.cmp(outfile, "{}testA_offset1_10.bw".format(ROOT)) is True) + assert filecmp.cmp(outfile, "{}testA_offset1_10.bw".format(ROOT)) is True unlink(outfile) @@ -402,7 +469,7 @@ def test_bam_coverage_offset_minus1(): """ Test -bs 1 --Offset -1 """ - outfile = '/tmp/test_offset.bw' + outfile = "/tmp/test_offset.bw" for fname in [BAMFILE_A, CRAMFILE_A]: args = "--Offset -1 -b {} -p 1 -bs 1 -o {}".format(fname, outfile) args = args.split() @@ -412,7 +479,7 @@ def test_bam_coverage_offset_minus1(): filecmp.clear_cache() except: pass - assert(filecmp.cmp(outfile, "{}testA_offset-1.bw".format(ROOT)) is True) + assert filecmp.cmp(outfile, "{}testA_offset-1.bw".format(ROOT)) is True unlink(outfile) @@ -420,7 +487,7 @@ def test_bam_coverage_offset20_minus4(): """ Test -bs 1 --Offset 20 -4 """ - outfile = '/tmp/test_offset.bw' + outfile = "/tmp/test_offset.bw" for fname in [BAMFILE_A, CRAMFILE_A]: args = "--Offset 20 -4 -b {} -p 1 -bs 1 -o {}".format(fname, outfile) args = args.split() @@ -430,7 +497,7 @@ def test_bam_coverage_offset20_minus4(): filecmp.clear_cache() except: pass - assert(filecmp.cmp(outfile, "{}testA_offset20_-4.bw".format(ROOT)) is True) + assert filecmp.cmp(outfile, "{}testA_offset20_-4.bw".format(ROOT)) is True unlink(outfile) @@ -438,26 +505,42 @@ def test_bam_compare_filter_blacklist(): """ Test --samFlagInclude --samFlagExclude --minMappingQuality --ignoreDuplicates and --blackListFileName """ - outfile = '/tmp/test_file_filter.bg' - for A, B in [(BAMFILE_FILTER1, BAMFILE_FILTER2), (CRAMFILE_FILTER1, CRAMFILE_FILTER2)]: - args = "-b1 {} -b2 {} -p 1 -o {} -of bedgraph --samFlagInclude 512 " \ - "--samFlagExclude 256 --minMappingQuality 5 --ignoreDuplicates " \ - "--blackListFileName {}".format(A, B, outfile, BEDFILE_FILTER) + outfile = "/tmp/test_file_filter.bg" + for A, B in [ + (BAMFILE_FILTER1, BAMFILE_FILTER2), + (CRAMFILE_FILTER1, CRAMFILE_FILTER2), + ]: + args = ( + "-b1 {} -b2 {} -p 1 -o {} -of bedgraph --samFlagInclude 512 " + "--samFlagExclude 256 --minMappingQuality 5 --ignoreDuplicates " + "--blackListFileName {}".format(A, B, outfile, BEDFILE_FILTER) + ) args = args.split() bam_comp.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['3R\t0\t100\t0\n', '3R\t100\t150\t-0.220909\n', - 
'3R\t150\t200\t-0.159356\n', '3R\t200\t250\t-0.0718929\n', - '3R\t250\t300\t0.135883\n', '3R\t300\t350\t0.103093\n', - '3R\t350\t400\t-0.0895516\n', '3R\t400\t450\t0.0308374\n', - '3R\t450\t500\t0.0989418\n', '3R\t500\t550\t0.207044\n', - '3R\t550\t600\t0.0198996\n', '3R\t600\t650\t-0.0957241\n', - '3R\t650\t700\t0.00968255\n', '3R\t700\t750\t-0.040642\n', - '3R\t750\t800\t-0.123451\n', '3R\t900\t950\t0.212545\n', - '3R\t950\t1000\t0.199309\n', '3R\t1000\t1050\t0.167945\n', - '3R\t1050\t1500\t0\n'] + expected = [ + "3R\t0\t100\t0\n", + "3R\t100\t150\t-0.220909\n", + "3R\t150\t200\t-0.159356\n", + "3R\t200\t250\t-0.0718929\n", + "3R\t250\t300\t0.135883\n", + "3R\t300\t350\t0.103093\n", + "3R\t350\t400\t-0.0895516\n", + "3R\t400\t450\t0.0308374\n", + "3R\t450\t500\t0.0989418\n", + "3R\t500\t550\t0.207044\n", + "3R\t550\t600\t0.0198996\n", + "3R\t600\t650\t-0.0957241\n", + "3R\t650\t700\t0.00968255\n", + "3R\t700\t750\t-0.040642\n", + "3R\t750\t800\t-0.123451\n", + "3R\t900\t950\t0.212545\n", + "3R\t950\t1000\t0.199309\n", + "3R\t1000\t1050\t0.167945\n", + "3R\t1050\t1500\t0\n", + ] assert_equal(resp, expected) unlink(outfile) diff --git a/deeptools/test/test_bigwigCompare_and_multiBigwigSummary.py b/deeptools/test/test_bigwigCompare_and_multiBigwigSummary.py index 831924277..0c9dac053 100644 --- a/deeptools/test/test_bigwigCompare_and_multiBigwigSummary.py +++ b/deeptools/test/test_bigwigCompare_and_multiBigwigSummary.py @@ -38,54 +38,66 @@ def test_bigwigCompare(): - outfile = '/tmp/result.bg' - args = "-b1 {} -b2 {} -o {} --operation add --outFileFormat bedgraph".format(BIGWIG_A, BIGWIG_B, outfile).split() + outfile = "/tmp/result.bg" + args = "-b1 {} -b2 {} -o {} --operation add --outFileFormat bedgraph".format( + BIGWIG_A, BIGWIG_B, outfile + ).split() bwComp.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['3R\t0\t50\t0\n', '3R\t50\t100\t1\n', '3R\t100\t150\t2\n', '3R\t150\t200\t3\n'] + expected = [ + "3R\t0\t50\t0\n", + "3R\t50\t100\t1\n", + "3R\t100\t150\t2\n", + "3R\t150\t200\t3\n", + ] assert resp == expected, "{} != {}".format(resp, expected) unlink(outfile) def test_bigwigCompare_skipnas(): - outfile = '/tmp/result.bg' - args = "-b1 {} -b2 {} -o {} --operation add --skipNAs " \ - "--outFileFormat bedgraph".format(BIGWIG_A, BIGWIG_B, outfile).split() + outfile = "/tmp/result.bg" + args = ( + "-b1 {} -b2 {} -o {} --operation add --skipNAs " + "--outFileFormat bedgraph".format(BIGWIG_A, BIGWIG_B, outfile).split() + ) bwComp.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['3R\t100\t150\t2\n', '3R\t150\t200\t3\n'] + expected = ["3R\t100\t150\t2\n", "3R\t150\t200\t3\n"] assert resp == expected, "{} != {}".format(resp, expected) unlink(outfile) def test_bigwigCompare_skipZeroOverZero(): outfile = '/tmp/result.bg' - args = "-b1 {} -b2 {} -o {} --skipZeroOverZero --pseudocount 1 3 --outFileFormat bedgraph".format(BIGWIG_A, BIGWIG_A, outfile).split() + args = "-b1 {} -b2 {} -o {} --skipZeroOverZero --pseudocount 1 3 --outFileFormat bedgraph".format( + BIGWIG_A, BIGWIG_A, outfile + ).split() bwComp.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['3R\t100\t200\t-1\n'] + expected = ["3R\t100\t200\t-1\n"] assert resp == expected, "{} != {}".format(resp, expected) unlink(outfile) def test_multiBigwigSummary(): - outfile = '/tmp/result.bg' - args = "bins -b {} {} --binSize 50 -o 
{}".format(BIGWIG_A, BIGWIG_B, outfile).split() + outfile = "/tmp/result.bg" + args = "bins -b {} {} --binSize 50 -o {}".format( + BIGWIG_A, BIGWIG_B, outfile + ).split() bwCorr.main(args) resp = np.load(outfile) - matrix = resp['matrix'] - labels = resp['labels'] - nt.assert_equal(matrix, np.array([[np.nan, np.nan], - [np.nan, 1.], - [1., 1.], - [1., 2.]])) - nt.assert_equal(labels, ['testA_skipNAs.bw', 'testB_skipNAs.bw']) + matrix = resp["matrix"] + labels = resp["labels"] + nt.assert_equal( + matrix, np.array([[np.nan, np.nan], [np.nan, 1.0], [1.0, 1.0], [1.0, 2.0]]) + ) + nt.assert_equal(labels, ["testA_skipNAs.bw", "testB_skipNAs.bw"]) unlink(outfile) @@ -93,10 +105,12 @@ def test_multiBigwigSummary_outrawcounts(): """ Test multiBigwigSummary raw counts output """ - outfile = '/tmp/result.bg' - args = "bins -b {} {} --binSize 50 -o /tmp/null --outRawCounts {} ".format(BIGWIG_A, BIGWIG_B, outfile).split() + outfile = "/tmp/result.bg" + args = "bins -b {} {} --binSize 50 -o /tmp/null --outRawCounts {} ".format( + BIGWIG_A, BIGWIG_B, outfile + ).split() bwCorr.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.read() _foo.close() expected = """#'chr' 'start' 'end' 'testA_skipNAs.bw' 'testB_skipNAs.bw' @@ -111,26 +125,30 @@ def test_multiBigwigSummary_outrawcounts(): def test_multiBigwigSummary_gtf(): - outfile = '/tmp/_test.npz' - args = "BED-file -b {0} {0} --BED {1}/test.gtf -o {2}".format(BIGWIG_C, ROOT, outfile).split() + outfile = "/tmp/_test.npz" + args = "BED-file -b {0} {0} --BED {1}/test.gtf -o {2}".format( + BIGWIG_C, ROOT, outfile + ).split() bwCorr.main(args) resp = np.load(outfile) - matrix = resp['matrix'] - labels = resp['labels'] - nt.assert_equal(labels, ['test1.bw.bw', 'test1.bw.bw']) - nt.assert_allclose(matrix, np.array([[27.475, 27.475], - [27.31248719, 27.31248719]])) + matrix = resp["matrix"] + labels = resp["labels"] + nt.assert_equal(labels, ["test1.bw.bw", "test1.bw.bw"]) + nt.assert_allclose(matrix, np.array([[27.475, 27.475], [27.31248719, 27.31248719]])) unlink(outfile) def test_multiBigwigSummary_metagene(): - outfile = '/tmp/_test.npz' - args = "BED-file --metagene -b {0} {0} --BED {1}/test.gtf -o {2}".format(BIGWIG_C, ROOT, outfile).split() + outfile = "/tmp/_test.npz" + args = "BED-file --metagene -b {0} {0} --BED {1}/test.gtf -o {2}".format( + BIGWIG_C, ROOT, outfile + ).split() bwCorr.main(args) resp = np.load(outfile) - matrix = resp['matrix'] - labels = resp['labels'] - nt.assert_equal(labels, ['test1.bw.bw', 'test1.bw.bw']) - nt.assert_allclose(matrix, np.array([[20.28956028, 20.28956028], - [22.1923501, 22.1923501]])) + matrix = resp["matrix"] + labels = resp["labels"] + nt.assert_equal(labels, ["test1.bw.bw", "test1.bw.bw"]) + nt.assert_allclose( + matrix, np.array([[20.28956028, 20.28956028], [22.1923501, 22.1923501]]) + ) unlink(outfile) diff --git a/deeptools/test/test_computeMatrixOperations.py b/deeptools/test/test_computeMatrixOperations.py index dda3cb3f9..a7f26368e 100644 --- a/deeptools/test/test_computeMatrixOperations.py +++ b/deeptools/test/test_computeMatrixOperations.py @@ -6,7 +6,7 @@ import gzip import json -__author__ = 'Devon' +__author__ = "Devon" ROOT = os.path.dirname(os.path.abspath(__file__)) + "/test_data/" @@ -31,33 +31,97 @@ def testSubset(self): """ computeMatrixOperations subset """ - dCorrect = {"verbose": True, "scale": 1, "skip zeros": False, "nan after end": False, "sort using": "mean", "unscaled 5 prime": [0, 0, 0, 0], "body": [1000, 1000, 1000, 1000], "sample_labels": 
["SRR648667.forward", "SRR648668.forward", "SRR648669.forward", "SRR648670.forward"], "downstream": [0, 0, 0, 0], "unscaled 3 prime": [0, 0, 0, 0], "group_labels": ["genes"], "bin size": [10, 10, 10, 10], "upstream": [0, 0, 0, 0], "group_boundaries": [0, 196], "sample_boundaries": [0, 100, 200, 300, 400], "max threshold": None, "ref point": [None, None, None, None], "min threshold": None, "sort regions": "no", "proc number": 20, "bin avg type": "mean", "missing data as zero": False} + dCorrect = { + "verbose": True, + "scale": 1, + "skip zeros": False, + "nan after end": False, + "sort using": "mean", + "unscaled 5 prime": [0, 0, 0, 0], + "body": [1000, 1000, 1000, 1000], + "sample_labels": [ + "SRR648667.forward", + "SRR648668.forward", + "SRR648669.forward", + "SRR648670.forward", + ], + "downstream": [0, 0, 0, 0], + "unscaled 3 prime": [0, 0, 0, 0], + "group_labels": ["genes"], + "bin size": [10, 10, 10, 10], + "upstream": [0, 0, 0, 0], + "group_boundaries": [0, 196], + "sample_boundaries": [0, 100, 200, 300, 400], + "max threshold": None, + "ref point": [None, None, None, None], + "min threshold": None, + "sort regions": "no", + "proc number": 20, + "bin avg type": "mean", + "missing data as zero": False, + } oname = "/tmp/subset.mat.gz" - args = "subset -m {} --sample SRR648667.forward SRR648668.forward SRR648669.forward SRR648670.forward -o {}".format(self.matrix, oname) + args = "subset -m {} --sample SRR648667.forward SRR648668.forward SRR648669.forward SRR648670.forward -o {}".format( + self.matrix, oname + ) args = args.split() cmo.main(args) f = gzip.GzipFile(oname) d = getHeader(f) # Skip the header, which can be in a different order h = hashlib.md5(f.read()).hexdigest() f.close() - assert(d == dCorrect) - assert(h == "edb3c8506c3f27ebb8c7ddf94d5ba594") + assert d == dCorrect + assert h == "edb3c8506c3f27ebb8c7ddf94d5ba594" os.remove(oname) def testRelabel(self): """ computeMatrixOperations relabel """ - dCorrect = {"verbose": True, "scale": 1, "skip zeros": False, "nan after end": False, "sort using": "mean", "unscaled 5 prime": [0, 0, 0, 0, 0, 0, 0, 0], "body": [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], "sample_labels": ["first", "sec ond", "3rd", "4th", "5th", "6th", "7th", "8th"], "downstream": [0, 0, 0, 0, 0, 0, 0, 0], "unscaled 3 prime": [0, 0, 0, 0, 0, 0, 0, 0], "group_labels": ["foo bar"], "bin size": [10, 10, 10, 10, 10, 10, 10, 10], "upstream": [0, 0, 0, 0, 0, 0, 0, 0], "group_boundaries": [0, 196], "sample_boundaries": [0, 100, 200, 300, 400, 500, 600, 700, 800], "max threshold": None, "ref point": [None, None, None, None, None, None, None, None], "min threshold": None, "sort regions": "no", "proc number": 20, "bin avg type": "mean", "missing data as zero": False} + dCorrect = { + "verbose": True, + "scale": 1, + "skip zeros": False, + "nan after end": False, + "sort using": "mean", + "unscaled 5 prime": [0, 0, 0, 0, 0, 0, 0, 0], + "body": [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], + "sample_labels": [ + "first", + "sec ond", + "3rd", + "4th", + "5th", + "6th", + "7th", + "8th", + ], + "downstream": [0, 0, 0, 0, 0, 0, 0, 0], + "unscaled 3 prime": [0, 0, 0, 0, 0, 0, 0, 0], + "group_labels": ["foo bar"], + "bin size": [10, 10, 10, 10, 10, 10, 10, 10], + "upstream": [0, 0, 0, 0, 0, 0, 0, 0], + "group_boundaries": [0, 196], + "sample_boundaries": [0, 100, 200, 300, 400, 500, 600, 700, 800], + "max threshold": None, + "ref point": [None, None, None, None, None, None, None, None], + "min threshold": None, + "sort regions": "no", + "proc number": 20, + "bin 
avg type": "mean", + "missing data as zero": False, + } oname = "/tmp/relabeled.mat.gz" - args = "relabel -m {} -o {} --sampleLabels first sec_ond 3rd 4th 5th 6th 7th 8th --groupLabels foo_bar".format(self.matrix, oname) + args = "relabel -m {} -o {} --sampleLabels first sec_ond 3rd 4th 5th 6th 7th 8th --groupLabels foo_bar".format( + self.matrix, oname + ) args = args.split() - args[7] = 'sec ond' # split mucks up spaces - args[-1] = 'foo bar' + args[7] = "sec ond" # split mucks up spaces + args[-1] = "foo bar" cmo.main(args) f = gzip.GzipFile(oname) d = getHeader(f) - assert(d == dCorrect) + assert d == dCorrect f.close() os.remove(oname) @@ -65,7 +129,39 @@ def testfilterStrand(self): """ computeMatrixOperations filterStrand """ - dCorrect = {"verbose": True, "scale": 1, "skip zeros": False, "nan after end": False, "sort using": "mean", "unscaled 5 prime": [0, 0, 0, 0, 0, 0, 0, 0], "body": [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], "sample_labels": ["SRR648667.forward", "SRR648668.forward", "SRR648669.forward", "SRR648670.forward", "SRR648667.reverse", "SRR648668.reverse", "SRR648669.reverse", "SRR648670.reverse"], "downstream": [0, 0, 0, 0, 0, 0, 0, 0], "unscaled 3 prime": [0, 0, 0, 0, 0, 0, 0, 0], "group_labels": ["genes"], "bin size": [10, 10, 10, 10, 10, 10, 10, 10], "upstream": [0, 0, 0, 0, 0, 0, 0, 0], "group_boundaries": [0, 107], "sample_boundaries": [0, 100, 200, 300, 400, 500, 600, 700, 800], "max threshold": None, "ref point": [None, None, None, None, None, None, None, None], "min threshold": None, "sort regions": "no", "proc number": 20, "bin avg type": "mean", "missing data as zero": False} + dCorrect = { + "verbose": True, + "scale": 1, + "skip zeros": False, + "nan after end": False, + "sort using": "mean", + "unscaled 5 prime": [0, 0, 0, 0, 0, 0, 0, 0], + "body": [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], + "sample_labels": [ + "SRR648667.forward", + "SRR648668.forward", + "SRR648669.forward", + "SRR648670.forward", + "SRR648667.reverse", + "SRR648668.reverse", + "SRR648669.reverse", + "SRR648670.reverse", + ], + "downstream": [0, 0, 0, 0, 0, 0, 0, 0], + "unscaled 3 prime": [0, 0, 0, 0, 0, 0, 0, 0], + "group_labels": ["genes"], + "bin size": [10, 10, 10, 10, 10, 10, 10, 10], + "upstream": [0, 0, 0, 0, 0, 0, 0, 0], + "group_boundaries": [0, 107], + "sample_boundaries": [0, 100, 200, 300, 400, 500, 600, 700, 800], + "max threshold": None, + "ref point": [None, None, None, None, None, None, None, None], + "min threshold": None, + "sort regions": "no", + "proc number": 20, + "bin avg type": "mean", + "missing data as zero": False, + } oname = "/tmp/filterStrand1.mat.gz" args = "filterStrand -m {} -o {} --strand +".format(self.matrix, oname) args = args.split() @@ -74,11 +170,43 @@ def testfilterStrand(self): d = getHeader(f) # Skip the header, which can be in a different order h = hashlib.md5(f.read()).hexdigest() f.close() - assert(d == dCorrect) - assert(h == "300f8000be5b5f51e803b57ef08f1c9e") + assert d == dCorrect + assert h == "300f8000be5b5f51e803b57ef08f1c9e" os.remove(oname) - dCorrect = {u'verbose': True, u'scale': 1, u'skip zeros': False, u'nan after end': False, u'sort using': u'mean', u'unscaled 5 prime': [0, 0, 0, 0, 0, 0, 0, 0], u'body': [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], u'sample_labels': [u'SRR648667.forward', u'SRR648668.forward', u'SRR648669.forward', u'SRR648670.forward', u'SRR648667.reverse', u'SRR648668.reverse', u'SRR648669.reverse', u'SRR648670.reverse'], u'downstream': [0, 0, 0, 0, 0, 0, 0, 0], u'unscaled 3 prime': [0, 0, 
0, 0, 0, 0, 0, 0], u'group_labels': [u'genes'], u'bin size': [10, 10, 10, 10, 10, 10, 10, 10], u'upstream': [0, 0, 0, 0, 0, 0, 0, 0], u'group_boundaries': [0, 89], u'sample_boundaries': [0, 100, 200, 300, 400, 500, 600, 700, 800], u'missing data as zero': False, u'ref point': [None, None, None, None, None, None, None, None], u'min threshold': None, u'sort regions': u'no', u'proc number': 20, u'bin avg type': u'mean', u'max threshold': None} + dCorrect = { + "verbose": True, + "scale": 1, + "skip zeros": False, + "nan after end": False, + "sort using": "mean", + "unscaled 5 prime": [0, 0, 0, 0, 0, 0, 0, 0], + "body": [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], + "sample_labels": [ + "SRR648667.forward", + "SRR648668.forward", + "SRR648669.forward", + "SRR648670.forward", + "SRR648667.reverse", + "SRR648668.reverse", + "SRR648669.reverse", + "SRR648670.reverse", + ], + "downstream": [0, 0, 0, 0, 0, 0, 0, 0], + "unscaled 3 prime": [0, 0, 0, 0, 0, 0, 0, 0], + "group_labels": ["genes"], + "bin size": [10, 10, 10, 10, 10, 10, 10, 10], + "upstream": [0, 0, 0, 0, 0, 0, 0, 0], + "group_boundaries": [0, 89], + "sample_boundaries": [0, 100, 200, 300, 400, 500, 600, 700, 800], + "missing data as zero": False, + "ref point": [None, None, None, None, None, None, None, None], + "min threshold": None, + "sort regions": "no", + "proc number": 20, + "bin avg type": "mean", + "max threshold": None, + } oname = "/tmp/filterStrand2.mat.gz" args = "filterStrand -m {} -o {} --strand -".format(self.matrix, oname) args = args.split() @@ -87,15 +215,47 @@ def testfilterStrand(self): d = getHeader(f) # Skip the header, which can be in a different order h = hashlib.md5(f.read()).hexdigest() f.close() - assert(d == dCorrect) - assert(h == "0a6ca070a5ba4564f1ab950ac3b7c8f1") + assert d == dCorrect + assert h == "0a6ca070a5ba4564f1ab950ac3b7c8f1" os.remove(oname) def testrbind(self): """ computeMatrixOperations rbind """ - dCorrect = {"verbose": True, "scale": 1, "skip zeros": False, "nan after end": False, "sort using": "mean", "unscaled 5 prime": [0, 0, 0, 0, 0, 0, 0, 0], "body": [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], "sample_labels": ["SRR648667.forward", "SRR648668.forward", "SRR648669.forward", "SRR648670.forward", "SRR648667.reverse", "SRR648668.reverse", "SRR648669.reverse", "SRR648670.reverse"], "downstream": [0, 0, 0, 0, 0, 0, 0, 0], "unscaled 3 prime": [0, 0, 0, 0, 0, 0, 0, 0], "group_labels": ["genes"], "bin size": [10, 10, 10, 10, 10, 10, 10, 10], "upstream": [0, 0, 0, 0, 0, 0, 0, 0], "group_boundaries": [0, 392], "sample_boundaries": [0, 100, 200, 300, 400, 500, 600, 700, 800], "max threshold": None, "ref point": [None, None, None, None, None, None, None, None], "min threshold": None, "sort regions": "no", "proc number": 20, "bin avg type": "mean", "missing data as zero": False} + dCorrect = { + "verbose": True, + "scale": 1, + "skip zeros": False, + "nan after end": False, + "sort using": "mean", + "unscaled 5 prime": [0, 0, 0, 0, 0, 0, 0, 0], + "body": [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], + "sample_labels": [ + "SRR648667.forward", + "SRR648668.forward", + "SRR648669.forward", + "SRR648670.forward", + "SRR648667.reverse", + "SRR648668.reverse", + "SRR648669.reverse", + "SRR648670.reverse", + ], + "downstream": [0, 0, 0, 0, 0, 0, 0, 0], + "unscaled 3 prime": [0, 0, 0, 0, 0, 0, 0, 0], + "group_labels": ["genes"], + "bin size": [10, 10, 10, 10, 10, 10, 10, 10], + "upstream": [0, 0, 0, 0, 0, 0, 0, 0], + "group_boundaries": [0, 392], + "sample_boundaries": [0, 100, 200, 300, 400, 
500, 600, 700, 800], + "max threshold": None, + "ref point": [None, None, None, None, None, None, None, None], + "min threshold": None, + "sort regions": "no", + "proc number": 20, + "bin avg type": "mean", + "missing data as zero": False, + } oname = "/tmp/rbind.mat.gz" args = "rbind -m {0} {0} -o {1}".format(self.matrix, oname) args = args.split() @@ -104,32 +264,166 @@ def testrbind(self): d = getHeader(f) # Skip the header, which can be in a different order h = hashlib.md5(f.read()).hexdigest() f.close() - assert(d == dCorrect) - assert(h == "3dd96c7b05e0ca5ada21212defe57fba") + assert d == dCorrect + assert h == "3dd96c7b05e0ca5ada21212defe57fba" os.remove(oname) def testrbind2(self): """ computeMatrixOperations rbind with different groups """ - dCorrect = {"verbose": False, "scale": 1, "skip zeros": False, "nan after end": False, "sort using": "mean", "unscaled 5 prime": [0], "body": [2], "sample_labels": ["signal"], "downstream": [1], "unscaled 3 prime": [0], "group_labels": ["somegenes", "othergenes"], "bin size": [1], "upstream": [1], "group_boundaries": [0, 3, 7], "sample_boundaries": [0, 4], "max threshold": None, "ref point": [None], "min threshold": None, "sort regions": "keep", "proc number": 1, "bin avg type": "mean", "missing data as zero": True} + dCorrect = { + "verbose": False, + "scale": 1, + "skip zeros": False, + "nan after end": False, + "sort using": "mean", + "unscaled 5 prime": [0], + "body": [2], + "sample_labels": ["signal"], + "downstream": [1], + "unscaled 3 prime": [0], + "group_labels": ["somegenes", "othergenes"], + "bin size": [1], + "upstream": [1], + "group_boundaries": [0, 3, 7], + "sample_boundaries": [0, 4], + "max threshold": None, + "ref point": [None], + "min threshold": None, + "sort regions": "keep", + "proc number": 1, + "bin avg type": "mean", + "missing data as zero": True, + } oname = "/tmp/rbind2.mat.gz" - args = "rbind -m {0} {1} -o {2}".format(self.rbindMatrix1, self.rbindMatrix2, oname) + args = "rbind -m {0} {1} -o {2}".format( + self.rbindMatrix1, self.rbindMatrix2, oname + ) args = args.split() cmo.main(args) f = gzip.GzipFile(oname) d = getHeader(f) # Skip the header, which can be in a different order h = hashlib.md5(f.read()).hexdigest() f.close() - assert(d == dCorrect) - assert(h == "5d8b1517fc4c63d000b6b37f70ee163b") + assert d == dCorrect + assert h == "5d8b1517fc4c63d000b6b37f70ee163b" os.remove(oname) def testcbind(self): """ computeMatrixOperations cbind """ - dCorrect = {"verbose": True, "scale": 1, "skip zeros": False, "nan after end": False, "sort using": "mean", "unscaled 5 prime": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "body": [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], "sample_labels": ["SRR648667.forward", "SRR648668.forward", "SRR648669.forward", "SRR648670.forward", "SRR648667.reverse", "SRR648668.reverse", "SRR648669.reverse", "SRR648670.reverse", "SRR648667.forward", "SRR648668.forward", "SRR648669.forward", "SRR648670.forward", "SRR648667.reverse", "SRR648668.reverse", "SRR648669.reverse", "SRR648670.reverse"], "downstream": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "unscaled 3 prime": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "group_labels": ["genes"], "bin size": [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10], "upstream": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "group_boundaries": [0, 196], "sample_boundaries": [0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600], "max 
threshold": None, "ref point": [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None], "min threshold": None, "sort regions": "no", "proc number": 20, "bin avg type": "mean", "missing data as zero": False} + dCorrect = { + "verbose": True, + "scale": 1, + "skip zeros": False, + "nan after end": False, + "sort using": "mean", + "unscaled 5 prime": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "body": [ + 1000, + 1000, + 1000, + 1000, + 1000, + 1000, + 1000, + 1000, + 1000, + 1000, + 1000, + 1000, + 1000, + 1000, + 1000, + 1000, + ], + "sample_labels": [ + "SRR648667.forward", + "SRR648668.forward", + "SRR648669.forward", + "SRR648670.forward", + "SRR648667.reverse", + "SRR648668.reverse", + "SRR648669.reverse", + "SRR648670.reverse", + "SRR648667.forward", + "SRR648668.forward", + "SRR648669.forward", + "SRR648670.forward", + "SRR648667.reverse", + "SRR648668.reverse", + "SRR648669.reverse", + "SRR648670.reverse", + ], + "downstream": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "unscaled 3 prime": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "group_labels": ["genes"], + "bin size": [ + 10, + 10, + 10, + 10, + 10, + 10, + 10, + 10, + 10, + 10, + 10, + 10, + 10, + 10, + 10, + 10, + ], + "upstream": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "group_boundaries": [0, 196], + "sample_boundaries": [ + 0, + 100, + 200, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1000, + 1100, + 1200, + 1300, + 1400, + 1500, + 1600, + ], + "max threshold": None, + "ref point": [ + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + ], + "min threshold": None, + "sort regions": "no", + "proc number": 20, + "bin avg type": "mean", + "missing data as zero": False, + } oname = "/tmp/filterStrand.mat.gz" args = "cbind -m {0} {0} -o {1}".format(self.matrix, oname) args = args.split() @@ -138,15 +432,47 @@ def testcbind(self): d = getHeader(f) # Skip the header, which can be in a different order h = hashlib.md5(f.read()).hexdigest() f.close() - assert(d == dCorrect) - assert(h == "e55d89704bb16a11f366663a8fd90a47") + assert d == dCorrect + assert h == "e55d89704bb16a11f366663a8fd90a47" os.remove(oname) def testsort(self): """ computeMatrixOperations sort """ - dCorrect = {"verbose": True, "scale": 1, "skip zeros": False, "nan after end": False, "sort using": "mean", "unscaled 5 prime": [0, 0, 0, 0, 0, 0, 0, 0], "body": [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], "sample_labels": ["SRR648667.forward", "SRR648668.forward", "SRR648669.forward", "SRR648670.forward", "SRR648667.reverse", "SRR648668.reverse", "SRR648669.reverse", "SRR648670.reverse"], "downstream": [0, 0, 0, 0, 0, 0, 0, 0], "unscaled 3 prime": [0, 0, 0, 0, 0, 0, 0, 0], "group_labels": ["genes"], "bin size": [10, 10, 10, 10, 10, 10, 10, 10], "upstream": [0, 0, 0, 0, 0, 0, 0, 0], "group_boundaries": [0, 196], "sample_boundaries": [0, 100, 200, 300, 400, 500, 600, 700, 800], "max threshold": None, "ref point": [None, None, None, None, None, None, None, None], "min threshold": None, "sort regions": "no", "proc number": 20, "bin avg type": "mean", "missing data as zero": False} + dCorrect = { + "verbose": True, + "scale": 1, + "skip zeros": False, + "nan after end": False, + "sort using": "mean", + "unscaled 5 prime": [0, 0, 0, 0, 0, 0, 0, 0], + "body": [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000], + "sample_labels": [ + "SRR648667.forward", + "SRR648668.forward", + "SRR648669.forward", + 
"SRR648670.forward", + "SRR648667.reverse", + "SRR648668.reverse", + "SRR648669.reverse", + "SRR648670.reverse", + ], + "downstream": [0, 0, 0, 0, 0, 0, 0, 0], + "unscaled 3 prime": [0, 0, 0, 0, 0, 0, 0, 0], + "group_labels": ["genes"], + "bin size": [10, 10, 10, 10, 10, 10, 10, 10], + "upstream": [0, 0, 0, 0, 0, 0, 0, 0], + "group_boundaries": [0, 196], + "sample_boundaries": [0, 100, 200, 300, 400, 500, 600, 700, 800], + "max threshold": None, + "ref point": [None, None, None, None, None, None, None, None], + "min threshold": None, + "sort regions": "no", + "proc number": 20, + "bin avg type": "mean", + "missing data as zero": False, + } oname = "/tmp/sorted.mat.gz" args = "sort -m {} -o {} -R {}".format(self.matrix, oname, self.bed) args = args.split() @@ -155,6 +481,6 @@ def testsort(self): d = getHeader(f) # Skip the header, which can be in a different order h = hashlib.md5(f.read()).hexdigest() f.close() - assert(d == dCorrect) - assert(h == "10ea07d1aa58f44625abe2142ef76094") + assert d == dCorrect + assert h == "10ea07d1aa58f44625abe2142ef76094" os.remove(oname) diff --git a/deeptools/test/test_countReadsPerBin.py b/deeptools/test/test_countReadsPerBin.py index 52941071d..6bd070246 100644 --- a/deeptools/test/test_countReadsPerBin.py +++ b/deeptools/test/test_countReadsPerBin.py @@ -5,13 +5,12 @@ import numpy.testing as nt import os.path -__author__ = 'Fidel' +__author__ = "Fidel" ROOT = os.path.dirname(os.path.abspath(__file__)) + "/test_data/" class TestCountReadsPerBin(object): - def setUp(self): """ The distribution of reads between the two bam files is as follows. @@ -32,115 +31,98 @@ def setUp(self): self.bamFile1 = self.root + "testA.bam" self.bamFile2 = self.root + "testB.bam" self.bamFile_PE = self.root + "test_paired2.bam" - self.chrom = '3R' + self.chrom = "3R" step_size = 50 bin_length = 25 - self.c = cr.CountReadsPerBin([self.bamFile1, self.bamFile2], - binLength=bin_length, - stepSize=step_size) + self.c = cr.CountReadsPerBin( + [self.bamFile1, self.bamFile2], binLength=bin_length, stepSize=step_size + ) def test_count_reads_in_region(self): self.c.skipZeros = False resp, _ = self.c.count_reads_in_region(self.chrom, 0, 200) - nt.assert_equal(resp, np.array([[0, 0.], - [0, 1.], - [1, 1.], - [1, 2.]])) + nt.assert_equal(resp, np.array([[0, 0.0], [0, 1.0], [1, 1.0], [1, 2.0]])) def test_count_reads_in_region_extension_1(self): """ In this case when read extension is smaller than read length extension is turned off and a warning is printed. """ - self.c = cr.CountReadsPerBin([self.bamFile1, self.bamFile2], - binLength=1, - stepSize=50, - extendReads=25) + self.c = cr.CountReadsPerBin( + [self.bamFile1, self.bamFile2], binLength=1, stepSize=50, extendReads=25 + ) resp, _ = self.c.count_reads_in_region(self.chrom, 0, 200) - nt.assert_equal(resp, np.array([[0, 0.], - [0, 1.], - [1, 1.], - [1, 2.]])) + nt.assert_equal(resp, np.array([[0, 0.0], [0, 1.0], [1, 1.0], [1, 2.0]])) def test_count_reads_in_region_total(self): - """ count the reads over the whole region + """count the reads over the whole region 2 for the first case, and 4 for the second """ self.c.skipZeros = False self.c.stepSize = 200 self.c.binLength = 200 resp, _ = self.c.count_reads_in_region(self.chrom, 0, 200) - nt.assert_equal(resp, np.array([[2, 4.]])) + nt.assert_equal(resp, np.array([[2, 4.0]])) def test_countReadsInRegions_min_mapping_quality(self): # Test min mapping quality. self.c.minMappingQuality = 40 self.c.skipZeros = False - resp, _ = self.c.count_reads_in_region(self. 
chrom, 0, 200) - nt.assert_equal(resp, np.array([[0, 0, 0, 1.], - [0, 0, 0, 1.]]).T) + resp, _ = self.c.count_reads_in_region(self.chrom, 0, 200) + nt.assert_equal(resp, np.array([[0, 0, 0, 1.0], [0, 0, 0, 1.0]]).T) def test_count_reads_in_region_ignore_duplicates(self): - # Test ignore duplicates self.c.skipZeros = False self.c.ignoreDuplicates = True resp, _ = self.c.count_reads_in_region(self.chrom, 0, 200) - nt.assert_equal(resp, np.array([[0, 0, 1, 1.], - [0, 1, 1, 1.]]).T) + nt.assert_equal(resp, np.array([[0, 0, 1, 1.0], [0, 1, 1, 1.0]]).T) def test_count_reads_in_region_ignore_bed_regions(self): # Test bed regions: bed_regions = [[self.chrom, [(10, 20)], "."], [self.chrom, [(150, 160)], "."]] self.c.skipZeros = False self.c.binLength = 10 - resp, _ = self.c.count_reads_in_region(self.chrom, 0, 200, bed_regions_list=bed_regions) - nt.assert_equal(resp, np.array([[0, 1.], - [0, 2.]]).T) + resp, _ = self.c.count_reads_in_region( + self.chrom, 0, 200, bed_regions_list=bed_regions + ) + nt.assert_equal(resp, np.array([[0, 1.0], [0, 2.0]]).T) def test_get_coverage_of_region_sam_flag_include(self): - self.c.samFlag_include = 16 # include reverse reads only self.c.bamFilesList = [self.bamFile1] - resp, _ = self.c.count_reads_in_region('3R', 0, 200) + resp, _ = self.c.count_reads_in_region("3R", 0, 200) nt.assert_array_equal(resp, np.array([[0], [0], [0], [1]])) def test_get_coverage_of_region_sam_flag_exclude(self): - self.c.samFlag_exclude = 16 # exclude reverse reads self.c.bamFilesList = [self.bamFile1] - resp, _ = self.c.count_reads_in_region('3R', 0, 200) + resp, _ = self.c.count_reads_in_region("3R", 0, 200) nt.assert_array_equal(resp, np.array([[0], [0], [1], [0]])) def test_get_coverage_of_region_large_bin(self): self.c.bamFilesList = [self.bamFile2] self.c.binLength = 200 self.c.stepSize = 200 - resp, _ = self.c.count_reads_in_region('3R', 0, 200) + resp, _ = self.c.count_reads_in_region("3R", 0, 200) nt.assert_array_equal(resp, np.array([[4]])) def test_get_coverage_of_region_ignore_duplicates(self): self.c.ignoreDuplicates = True self.c.bamFilesList = [self.bamFile2] - resp, _ = self.c.count_reads_in_region('3R', 0, 200) - nt.assert_array_equal(resp, np.array([[0.], - [1.], - [1.], - [1.]])) + resp, _ = self.c.count_reads_in_region("3R", 0, 200) + nt.assert_array_equal(resp, np.array([[0.0], [1.0], [1.0], [1.0]])) # check zero to nans self.c.zerosToNans = True - resp, _ = self.c.count_reads_in_region('3R', 0, 200) - nt.assert_array_equal(resp, np.array([[np.nan], - [1.], - [1.], - [1.]])) + resp, _ = self.c.count_reads_in_region("3R", 0, 200) + nt.assert_array_equal(resp, np.array([[np.nan], [1.0], [1.0], [1.0]])) def test_get_coverage_of_region_split_read(self): """ @@ -154,43 +136,35 @@ def test_get_coverage_of_region_split_read(self): self.c.bamFilesList = [self.bamFile1] self.c.binLength = 10 self.c.stepSize = 10 - resp, _ = self.c.count_reads_in_region('chr_cigar', 0, 100) - nt.assert_array_equal(resp, np.array([[0.], - [1.], - [1.], - [0.], - [1.], - [0.], - [0.], - [0.], - [0.], - [0.]])) + resp, _ = self.c.count_reads_in_region("chr_cigar", 0, 100) + nt.assert_array_equal( + resp, + np.array( + [[0.0], [1.0], [1.0], [0.0], [1.0], [0.0], [0.0], [0.0], [0.0], [0.0]] + ), + ) def test_get_coverage_of_region_zeros_to_nan(self): self.c.zerosToNans = True resp, _ = self.c.count_reads_in_region(self.chrom, 0, 200) - nt.assert_equal(resp, np.array([[np.nan, np.nan], - [np.nan, 1], - [1, 1], - [1, 2]])) + nt.assert_equal(resp, np.array([[np.nan, np.nan], [np.nan, 1], [1, 1], [1, 
2]])) def test_bed_file(self): bed = "chr3R\t0\t10\nchr3R\t110\t120\nchr3R\t160\t180" import tempfile + bed_file = tempfile.NamedTemporaryFile(suffix=".bed", delete=False, mode="w") bed_file.write(bed) bed_file.close() - self.c = cr.CountReadsPerBin([self.bamFile2], - bedFile=[bed_file.name]) + self.c = cr.CountReadsPerBin([self.bamFile2], bedFile=[bed_file.name]) resp = self.c.run() - nt.assert_equal(resp, np.array([[0.], - [1.], - [2.]])) + nt.assert_equal(resp, np.array([[0.0], [1.0], [2.0]])) import os + os.unlink(bed_file.name) @@ -216,10 +190,10 @@ def setUp(self): self.bamFile1 = self.root + "testA.cram" self.bamFile2 = self.root + "testB.cram" self.bamFile_PE = self.root + "test_paired2.cram" - self.chrom = '3R' + self.chrom = "3R" step_size = 50 bin_length = 25 - self.c = cr.CountReadsPerBin([self.bamFile1, self.bamFile2], - binLength=bin_length, - stepSize=step_size) + self.c = cr.CountReadsPerBin( + [self.bamFile1, self.bamFile2], binLength=bin_length, stepSize=step_size + ) diff --git a/deeptools/test/test_heatmapper.py b/deeptools/test/test_heatmapper.py index 082136fa6..d757a4802 100644 --- a/deeptools/test/test_heatmapper.py +++ b/deeptools/test/test_heatmapper.py @@ -7,7 +7,7 @@ import deeptools.utilities import json -__author__ = 'Fidel' +__author__ = "Fidel" ROOT = os.path.dirname(os.path.abspath(__file__)) + "/test_heatmapper/" @@ -30,14 +30,22 @@ def cmpMatrices(f1, f2): p2 = json.loads(l2[1:]) for k, v in p1.items(): if k not in p2.keys(): - sys.stderr.write("key in {} missing: {} not in {}\n".format(f1, k, p2.keys())) + sys.stderr.write( + "key in {} missing: {} not in {}\n".format(f1, k, p2.keys()) + ) rv = False if p1[k] != p2[k]: - sys.stderr.write("values of '{}' is different: {} not in {}\n".format(k, p1[k], p2[k])) + sys.stderr.write( + "values of '{}' is different: {} not in {}\n".format( + k, p1[k], p2[k] + ) + ) rv = False for k in p2.keys(): if k not in p1.keys(): - sys.stderr.write("key in {} missing: {} not in {}\n".format(f2, k, p1.keys())) + sys.stderr.write( + "key in {} missing: {} not in {}\n".format(f2, k, p1.keys()) + ) rv = False else: if l1 != l2: @@ -49,200 +57,267 @@ def cmpMatrices(f1, f2): def test_computeMatrix_reference_point(): - args = "reference-point -R {0}/test2.bed -S {0}/test.bw -b 100 -a 100 " \ - "--outFileName /tmp/_test.mat.gz -bs 1 -p 1".format(ROOT).split() + args = ( + "reference-point -R {0}/test2.bed -S {0}/test.bw -b 100 -a 100 " + "--outFileName /tmp/_test.mat.gz -bs 1 -p 1".format(ROOT).split() + ) deeptools.computeMatrix.main(args) - os.system('gunzip -f /tmp/_test.mat.gz') - assert cmpMatrices(ROOT + '/master.mat', '/tmp/_test.mat') is True - os.remove('/tmp/_test.mat') + os.system("gunzip -f /tmp/_test.mat.gz") + assert cmpMatrices(ROOT + "/master.mat", "/tmp/_test.mat") is True + os.remove("/tmp/_test.mat") def test_computeMatrix_reference_point_center(): - args = "reference-point -R {0}/test2.bed -S {0}/test.bw -b 100 -a 100 --referencePoint center " \ - "--outFileName /tmp/_test.mat.gz -bs 1 -p 1".format(ROOT).split() + args = ( + "reference-point -R {0}/test2.bed -S {0}/test.bw -b 100 -a 100 --referencePoint center " + "--outFileName /tmp/_test.mat.gz -bs 1 -p 1".format(ROOT).split() + ) deeptools.computeMatrix.main(args) - os.system('gunzip -f /tmp/_test.mat.gz') - assert cmpMatrices(ROOT + '/master_center.mat', '/tmp/_test.mat') is True - os.remove('/tmp/_test.mat') + os.system("gunzip -f /tmp/_test.mat.gz") + assert cmpMatrices(ROOT + "/master_center.mat", "/tmp/_test.mat") is True + os.remove("/tmp/_test.mat") 
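
An aside on the comparison strategy used by `cmpMatrices` and the `getHeader`/md5 assertions in these tests: a deepTools matrix appears to be gzipped text whose first line is a single marker character followed by a JSON header (this is implied by the `json.loads(l1[1:])` calls above, not stated in this PR), so headers can be compared as dicts, where key order is irrelevant, while the numeric body is pinned byte-for-byte. A minimal sketch under that assumption; `split_matrix` is a hypothetical helper, not code from this repository:

```python
import gzip
import hashlib
import json


def split_matrix(path):
    # Parse the header as a dict so key order does not matter, then hash
    # the remaining (numeric) body, mirroring the tests' two assertions.
    with gzip.open(path, "rt") as f:
        header = json.loads(f.readline()[1:])  # drop the leading marker char
        body_md5 = hashlib.md5(f.read().encode()).hexdigest()
    return header, body_md5
```

Comparing `header` for dict equality and `body_md5` for identity is why the tests can tolerate a reordered header while still fixing the matrix contents to a known checksum.
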
def test_computeMatrix_reference_point_tes(): - args = "reference-point -R {0}/test2.bed -S {0}/test.bw -b 100 -a 100 --referencePoint TES " \ - "--outFileName /tmp/_test.mat.gz -bs 1 -p 1".format(ROOT).split() + args = ( + "reference-point -R {0}/test2.bed -S {0}/test.bw -b 100 -a 100 --referencePoint TES " + "--outFileName /tmp/_test.mat.gz -bs 1 -p 1".format(ROOT).split() + ) deeptools.computeMatrix.main(args) - os.system('gunzip -f /tmp/_test.mat.gz') - assert cmpMatrices(ROOT + '/master_TES.mat', '/tmp/_test.mat') is True - os.remove('/tmp/_test.mat') + os.system("gunzip -f /tmp/_test.mat.gz") + assert cmpMatrices(ROOT + "/master_TES.mat", "/tmp/_test.mat") is True + os.remove("/tmp/_test.mat") def test_computeMatrix_reference_point_missing_data_as_zero(): - args = "reference-point -R {0}/test2.bed -S {0}/test.bw -b 100 -a 100 " \ - "--outFileName /tmp/_test.mat.gz -bs 1 -p 1 --missingDataAsZero".format(ROOT).split() + args = ( + "reference-point -R {0}/test2.bed -S {0}/test.bw -b 100 -a 100 " + "--outFileName /tmp/_test.mat.gz -bs 1 -p 1 --missingDataAsZero".format( + ROOT + ).split() + ) deeptools.computeMatrix.main(args) - os.system('gunzip -f /tmp/_test.mat.gz') - assert cmpMatrices(ROOT + '/master_nan_to_zero.mat', '/tmp/_test.mat') is True - os.remove('/tmp/_test.mat') + os.system("gunzip -f /tmp/_test.mat.gz") + assert cmpMatrices(ROOT + "/master_nan_to_zero.mat", "/tmp/_test.mat") is True + os.remove("/tmp/_test.mat") def test_computeMatrix_scale_regions(): - args = "scale-regions -R {0}/test2.bed -S {0}/test.bw -b 100 -a 100 -m 100 " \ - "--outFileName /tmp/_test2.mat.gz -bs 1 -p 1".format(ROOT).split() + args = ( + "scale-regions -R {0}/test2.bed -S {0}/test.bw -b 100 -a 100 -m 100 " + "--outFileName /tmp/_test2.mat.gz -bs 1 -p 1".format(ROOT).split() + ) deeptools.computeMatrix.main(args) - os.system('gunzip -f /tmp/_test2.mat.gz') - assert cmpMatrices(ROOT + '/master_scale_reg.mat', '/tmp/_test2.mat') is True - os.remove('/tmp/_test2.mat') + os.system("gunzip -f /tmp/_test2.mat.gz") + assert cmpMatrices(ROOT + "/master_scale_reg.mat", "/tmp/_test2.mat") is True + os.remove("/tmp/_test2.mat") def test_computeMatrix_multiple_bed(): - args = "reference-point -R {0}/group1.bed {0}/group2.bed -S {0}/test.bw -b 100 -a 100 " \ - "--outFileName /tmp/_test.mat.gz -bs 1 -p 1".format(ROOT).split() + args = ( + "reference-point -R {0}/group1.bed {0}/group2.bed -S {0}/test.bw -b 100 -a 100 " + "--outFileName /tmp/_test.mat.gz -bs 1 -p 1".format(ROOT).split() + ) deeptools.computeMatrix.main(args) - os.system('gunzip -f /tmp/_test.mat.gz') - assert cmpMatrices(ROOT + '/master_multibed.mat', '/tmp/_test.mat') is True - os.remove('/tmp/_test.mat') + os.system("gunzip -f /tmp/_test.mat.gz") + assert cmpMatrices(ROOT + "/master_multibed.mat", "/tmp/_test.mat") is True + os.remove("/tmp/_test.mat") def test_computeMatrix_region_extend_over_chr_end(): - args = "reference-point -R {0}/group1.bed {0}/group2.bed -S {0}/test.bw -b 100 -a 500 " \ - "--outFileName /tmp/_test.mat.gz -bs 1 -p 1".format(ROOT).split() + args = ( + "reference-point -R {0}/group1.bed {0}/group2.bed -S {0}/test.bw -b 100 -a 500 " + "--outFileName /tmp/_test.mat.gz -bs 1 -p 1".format(ROOT).split() + ) deeptools.computeMatrix.main(args) - os.system('gunzip -f /tmp/_test.mat.gz') - assert cmpMatrices(ROOT + '/master_extend_beyond_chr_size.mat', '/tmp/_test.mat') is True - os.remove('/tmp/_test.mat') + os.system("gunzip -f /tmp/_test.mat.gz") + assert ( + cmpMatrices(ROOT + "/master_extend_beyond_chr_size.mat", 
"/tmp/_test.mat") + is True + ) + os.remove("/tmp/_test.mat") def test_computeMatrix_unscaled(): - args = "scale-regions -S {0}/unscaled.bigWig -R {0}/unscaled.bed -a 300 -b 500 --unscaled5prime 100 --unscaled3prime 50 " \ - "--outFileName /tmp/_test.mat.gz -bs 10 -p 1".format(ROOT).split() + args = ( + "scale-regions -S {0}/unscaled.bigWig -R {0}/unscaled.bed -a 300 -b 500 --unscaled5prime 100 --unscaled3prime 50 " + "--outFileName /tmp/_test.mat.gz -bs 10 -p 1".format(ROOT).split() + ) deeptools.computeMatrix.main(args) - os.system('gunzip -f /tmp/_test.mat.gz') - assert cmpMatrices(ROOT + '/master_unscaled.mat', '/tmp/_test.mat') is True - os.remove('/tmp/_test.mat') + os.system("gunzip -f /tmp/_test.mat.gz") + assert cmpMatrices(ROOT + "/master_unscaled.mat", "/tmp/_test.mat") is True + os.remove("/tmp/_test.mat") def test_computeMatrix_gtf(): - args = "scale-regions -S {0}../test_data/test1.bw.bw -R {0}../test_data/test.gtf -a 300 -b 500 --unscaled5prime 20 --unscaled3prime 50 " \ - "--outFileName /tmp/_test_gtf.mat.gz -bs 10 -p 1".format(ROOT).split() + args = ( + "scale-regions -S {0}../test_data/test1.bw.bw -R {0}../test_data/test.gtf -a 300 -b 500 --unscaled5prime 20 --unscaled3prime 50 " + "--outFileName /tmp/_test_gtf.mat.gz -bs 10 -p 1".format(ROOT).split() + ) deeptools.computeMatrix.main(args) - os.system('gunzip -f /tmp/_test_gtf.mat.gz') - assert cmpMatrices(ROOT + '/master_gtf.mat', '/tmp/_test_gtf.mat') is True - os.remove('/tmp/_test_gtf.mat') + os.system("gunzip -f /tmp/_test_gtf.mat.gz") + assert cmpMatrices(ROOT + "/master_gtf.mat", "/tmp/_test_gtf.mat") is True + os.remove("/tmp/_test_gtf.mat") def test_computeMatrix_metagene(): - args = "scale-regions -S {0}../test_data/test1.bw.bw -R {0}../test_data/test.gtf -a 300 -b 500 --unscaled5prime 20 --unscaled3prime 50 " \ - "--outFileName /tmp/_test_metagene.mat.gz -bs 10 -p 1 --metagene".format(ROOT).split() + args = ( + "scale-regions -S {0}../test_data/test1.bw.bw -R {0}../test_data/test.gtf -a 300 -b 500 --unscaled5prime 20 --unscaled3prime 50 " + "--outFileName /tmp/_test_metagene.mat.gz -bs 10 -p 1 --metagene".format( + ROOT + ).split() + ) deeptools.computeMatrix.main(args) - os.system('gunzip -f /tmp/_test_metagene.mat.gz') - assert cmpMatrices(ROOT + '/master_metagene.mat', '/tmp/_test_metagene.mat') is True - os.remove('/tmp/_test_metagene.mat') + os.system("gunzip -f /tmp/_test_metagene.mat.gz") + assert cmpMatrices(ROOT + "/master_metagene.mat", "/tmp/_test_metagene.mat") is True + os.remove("/tmp/_test_metagene.mat") def test_chopRegions_body(): region = [(0, 200), (300, 400), (800, 900)] - lbins, bodybins, rbins, padLeft, padRight = deeptools.heatmapper.chopRegions(region, left=0, right=0) - assert(lbins == []) - assert(rbins == []) - assert(bodybins == region) - assert(padLeft == 0) - assert(padRight == 0) + lbins, bodybins, rbins, padLeft, padRight = deeptools.heatmapper.chopRegions( + region, left=0, right=0 + ) + assert lbins == [] + assert rbins == [] + assert bodybins == region + assert padLeft == 0 + assert padRight == 0 # Unscaled 5', 3' - lbins, bodybins, rbins, padLeft, padRight = deeptools.heatmapper.chopRegions(region, left=150, right=150) - assert(lbins == [(0, 150)]) - assert(rbins == [(350, 400), (800, 900)]) - assert(bodybins == [(150, 200), (300, 350)]) - assert(padLeft == 0) - assert(padRight == 0) + lbins, bodybins, rbins, padLeft, padRight = deeptools.heatmapper.chopRegions( + region, left=150, right=150 + ) + assert lbins == [(0, 150)] + assert rbins == [(350, 400), (800, 900)] + assert 
bodybins == [(150, 200), (300, 350)] + assert padLeft == 0 + assert padRight == 0 def test_chopRegions_TSS(): region = [(0, 200), (300, 400), (800, 900)] # + strand, 250 downstream - downstream, body, unscaled3prime, padRight, _ = deeptools.heatmapper.chopRegions(region, left=250) - assert(downstream == [(0, 200), (300, 350)]) - assert(body == [(350, 400), (800, 900)]) - assert(unscaled3prime == []) - assert(padRight == 0) - assert(_ == 0) + downstream, body, unscaled3prime, padRight, _ = deeptools.heatmapper.chopRegions( + region, left=250 + ) + assert downstream == [(0, 200), (300, 350)] + assert body == [(350, 400), (800, 900)] + assert unscaled3prime == [] + assert padRight == 0 + assert _ == 0 # + strand, 500 downstream - downstream, body, unscaled3prime, padRight, _ = deeptools.heatmapper.chopRegions(region, left=500) - assert(downstream == region) - assert(body == []) - assert(unscaled3prime == []) - assert(padRight == 100) - assert(_ == 0) + downstream, body, unscaled3prime, padRight, _ = deeptools.heatmapper.chopRegions( + region, left=500 + ) + assert downstream == region + assert body == [] + assert unscaled3prime == [] + assert padRight == 100 + assert _ == 0 # - strand, 250 downstream (labeled "upstream" due to being on the - strand) - unscaled5prime, body, upstream, _, padLeft = deeptools.heatmapper.chopRegions(region, right=250) - assert(upstream == [(150, 200), (300, 400), (800, 900)]) - assert(body == [(0, 150)]) - assert(unscaled5prime == []) - assert(padLeft == 0) - assert(_ == 0) + unscaled5prime, body, upstream, _, padLeft = deeptools.heatmapper.chopRegions( + region, right=250 + ) + assert upstream == [(150, 200), (300, 400), (800, 900)] + assert body == [(0, 150)] + assert unscaled5prime == [] + assert padLeft == 0 + assert _ == 0 # - strand, 500 downstream (labeled "upstream" due to being on the - strand) - unscaled5prime, body, upstream, _, padLeft = deeptools.heatmapper.chopRegions(region, right=500) - assert(upstream == region) - assert(body == []) - assert(unscaled5prime == []) - assert(padLeft == 100) - assert(_ == 0) + unscaled5prime, body, upstream, _, padLeft = deeptools.heatmapper.chopRegions( + region, right=500 + ) + assert upstream == region + assert body == [] + assert unscaled5prime == [] + assert padLeft == 100 + assert _ == 0 def test_chopRegions_TES(): region = [(0, 200), (300, 400), (800, 900)] # + strand, 250 upstream - unscaled5prime, body, upstream, _, padLeft = deeptools.heatmapper.chopRegions(region, right=250) - assert(unscaled5prime == []) - assert(body == [(0, 150)]) - assert(upstream == [(150, 200), (300, 400), (800, 900)]) - assert(_ == 0) - assert(padLeft == 0) + unscaled5prime, body, upstream, _, padLeft = deeptools.heatmapper.chopRegions( + region, right=250 + ) + assert unscaled5prime == [] + assert body == [(0, 150)] + assert upstream == [(150, 200), (300, 400), (800, 900)] + assert _ == 0 + assert padLeft == 0 # + strand, 500 upstream - unscaled5prime, body, upstream, _, padLeft = deeptools.heatmapper.chopRegions(region, right=500) - assert(unscaled5prime == []) - assert(body == []) - assert(upstream == region) - assert(_ == 0) - assert(padLeft == 100) + unscaled5prime, body, upstream, _, padLeft = deeptools.heatmapper.chopRegions( + region, right=500 + ) + assert unscaled5prime == [] + assert body == [] + assert upstream == region + assert _ == 0 + assert padLeft == 100 # + strand, 250 downstream (labeled "upstream" due to being on the - strand) - downstream, body, unscaled3prime, padRight, _ = 
deeptools.heatmapper.chopRegions(region, left=250) - assert(downstream == [(0, 200), (300, 350)]) - assert(body == [(350, 400), (800, 900)]) - assert(unscaled3prime == []) - assert(padRight == 0) - assert(_ == 0) + downstream, body, unscaled3prime, padRight, _ = deeptools.heatmapper.chopRegions( + region, left=250 + ) + assert downstream == [(0, 200), (300, 350)] + assert body == [(350, 400), (800, 900)] + assert unscaled3prime == [] + assert padRight == 0 + assert _ == 0 # + strand, 500 downstream (labeled "upstream" due to being on the - strand) - downstream, body, unscaled3prime, padRight, _ = deeptools.heatmapper.chopRegions(region, left=500) - assert(downstream == region) - assert(body == []) - assert(unscaled3prime == []) - assert(padRight == 100) - assert(_ == 0) + downstream, body, unscaled3prime, padRight, _ = deeptools.heatmapper.chopRegions( + region, left=500 + ) + assert downstream == region + assert body == [] + assert unscaled3prime == [] + assert padRight == 100 + assert _ == 0 def test_chopRegionsFromMiddle(): region = [(0, 200), (300, 400), (800, 900)] # + strand, 100 upstream/200 downstream - upstream, downstream, padLeft, padRight = deeptools.heatmapper.chopRegionsFromMiddle(region, left=100, right=200) - assert(upstream == [(100, 200)]) - assert(downstream == [(300, 400), (800, 900)]) - assert(padLeft == 0) - assert(padRight == 0) + ( + upstream, + downstream, + padLeft, + padRight, + ) = deeptools.heatmapper.chopRegionsFromMiddle(region, left=100, right=200) + assert upstream == [(100, 200)] + assert downstream == [(300, 400), (800, 900)] + assert padLeft == 0 + assert padRight == 0 # + strand, 250 upstream/300 downstream - upstream, downstream, padLeft, padRight = deeptools.heatmapper.chopRegionsFromMiddle(region, left=250, right=300) - assert(upstream == [(0, 200)]) - assert(downstream == [(300, 400), (800, 900)]) - assert(padLeft == 50) - assert(padRight == 100) + ( + upstream, + downstream, + padLeft, + padRight, + ) = deeptools.heatmapper.chopRegionsFromMiddle(region, left=250, right=300) + assert upstream == [(0, 200)] + assert downstream == [(300, 400), (800, 900)] + assert padLeft == 50 + assert padRight == 100 # - strand, 100 upstream/200 downstream - upstream, downstream, padLeft, padRight = deeptools.heatmapper.chopRegionsFromMiddle(region, left=200, right=100) - assert(upstream == [(0, 200)]) - assert(downstream == [(300, 400)]) - assert(padLeft == 0) - assert(padRight == 0) + ( + upstream, + downstream, + padLeft, + padRight, + ) = deeptools.heatmapper.chopRegionsFromMiddle(region, left=200, right=100) + assert upstream == [(0, 200)] + assert downstream == [(300, 400)] + assert padLeft == 0 + assert padRight == 0 # - strand, 250 upstream/300 downstream - upstream, downstream, padLeft, padRight = deeptools.heatmapper.chopRegionsFromMiddle(region, left=300, right=250) - assert(upstream == [(0, 200)]) - assert(downstream == [(300, 400), (800, 900)]) - assert(padLeft == 100) - assert(padRight == 50) + ( + upstream, + downstream, + padLeft, + padRight, + ) = deeptools.heatmapper.chopRegionsFromMiddle(region, left=300, right=250) + assert upstream == [(0, 200)] + assert downstream == [(300, 400), (800, 900)] + assert padLeft == 100 + assert padRight == 50 diff --git a/deeptools/test/test_multiBamSummary.py b/deeptools/test/test_multiBamSummary.py index c1716352e..cdf7cdf91 100644 --- a/deeptools/test/test_multiBamSummary.py +++ b/deeptools/test/test_multiBamSummary.py @@ -14,43 +14,49 @@ def test_multiBamSummary_gtf(): - outfile = '/tmp/_test.npz' + outfile = 
"/tmp/_test.npz" for fname in [BAM, CRAM]: - args = 'BED-file --BED {0} -b {1} {1} -o {2}'.format(GTF, fname, outfile).split() + args = "BED-file --BED {0} -b {1} {1} -o {2}".format( + GTF, fname, outfile + ).split() mbs.main(args) resp = np.load(outfile) - matrix = resp['matrix'] - labels = resp['labels'] + matrix = resp["matrix"] + labels = resp["labels"] if fname == BAM: - nt.assert_equal(labels, ['test1.bam', 'test1.bam']) + nt.assert_equal(labels, ["test1.bam", "test1.bam"]) else: - nt.assert_equal(labels, ['test1.cram', 'test1.cram']) - nt.assert_allclose(matrix, np.array([[144.0, 144.0], - [143.0, 143.0]])) + nt.assert_equal(labels, ["test1.cram", "test1.cram"]) + nt.assert_allclose(matrix, np.array([[144.0, 144.0], [143.0, 143.0]])) unlink(outfile) def test_multiBamSummary_metagene(): - outfile = '/tmp/_test.npz' + outfile = "/tmp/_test.npz" for fname in [BAM, CRAM]: - args = 'BED-file --BED {0} -b {1} {1} -o {2} --metagene'.format(GTF, fname, outfile).split() + args = "BED-file --BED {0} -b {1} {1} -o {2} --metagene".format( + GTF, fname, outfile + ).split() mbs.main(args) resp = np.load(outfile) - matrix = resp['matrix'] - labels = resp['labels'] + matrix = resp["matrix"] + labels = resp["labels"] if fname == BAM: - nt.assert_equal(labels, ['test1.bam', 'test1.bam']) + nt.assert_equal(labels, ["test1.bam", "test1.bam"]) else: - nt.assert_equal(labels, ['test1.cram', 'test1.cram']) - nt.assert_allclose(matrix, np.array([[25.0, 25.0], - [31.0, 31.0]])) + nt.assert_equal(labels, ["test1.cram", "test1.cram"]) + nt.assert_allclose(matrix, np.array([[25.0, 25.0], [31.0, 31.0]])) unlink(outfile) def test_multiBamSummary_scalingFactors(): - outfile = '/tmp/test.scalingFactors.txt' - args = 'bins --binSize 50 -b {} {} --scalingFactors {}'.format(BAMA, BAMB, outfile).split() + outfile = "/tmp/test.scalingFactors.txt" + args = "bins --binSize 50 -b {} {} --scalingFactors {}".format( + BAMA, BAMB, outfile + ).split() mbs.main(args) - resp = open(outfile).read().strip().split('\n') - nt.assert_equal(resp, ["sample\tscalingFactor", "testA.bam\t1.1892", "testB.bam\t0.8409"]) + resp = open(outfile).read().strip().split("\n") + nt.assert_equal( + resp, ["sample\tscalingFactor", "testA.bam\t1.1892", "testB.bam\t0.8409"] + ) unlink(outfile) diff --git a/deeptools/test/test_plotCoverage.py b/deeptools/test/test_plotCoverage.py index 6fe661948..2ce242db7 100644 --- a/deeptools/test/test_plotCoverage.py +++ b/deeptools/test/test_plotCoverage.py @@ -5,7 +5,7 @@ import deeptools.plotCoverage import deeptools.utilities -__author__ = 'Bjoern' +__author__ = "Bjoern" TEST_DATA = os.path.dirname(os.path.abspath(__file__)) + "/test_data/" ROOT = os.path.dirname(os.path.abspath(__file__)) + "/test_plotCoverage/" @@ -14,17 +14,32 @@ def test_plotCoverage_default(): - plotfile = NamedTemporaryFile(suffix='.png', prefix='deeptools_testfile_', delete=False) - txtfile = NamedTemporaryFile(suffix='.tab', prefix='deeptools_testfile_', delete=False) + plotfile = NamedTemporaryFile( + suffix=".png", prefix="deeptools_testfile_", delete=False + ) + txtfile = NamedTemporaryFile( + suffix=".tab", prefix="deeptools_testfile_", delete=False + ) for fmat in ["bam", "cram"]: - args = "--bamfiles {0}test1.{3} {0}test2.{3} --plotFile {1}" \ - " --plotFileFormat png --outRawCounts {2}".format(TEST_DATA, plotfile.name, txtfile.name, fmat).split() + args = ( + "--bamfiles {0}test1.{3} {0}test2.{3} --plotFile {1}" + " --plotFileFormat png --outRawCounts {2}".format( + TEST_DATA, plotfile.name, txtfile.name, fmat + ).split() + ) 
deeptools.plotCoverage.main(args) if fmat == "bam": - assert filecmp.cmp(os.path.join(ROOT, 'outRawCounts_default.tabular'), txtfile.name) is True + assert ( + filecmp.cmp( + os.path.join(ROOT, "outRawCounts_default.tabular"), txtfile.name + ) + is True + ) - res = compare_images(ROOT + 'plotCoverage_default.png', plotfile.name, tolerance) + res = compare_images( + ROOT + "plotCoverage_default.png", plotfile.name, tolerance + ) assert res is None, res os.remove(txtfile.name) os.remove(plotfile.name) diff --git a/deeptools/test/test_readFiltering.py b/deeptools/test/test_readFiltering.py index 65c5a43f3..c526dc2de 100644 --- a/deeptools/test/test_readFiltering.py +++ b/deeptools/test/test_readFiltering.py @@ -17,15 +17,17 @@ def test_estimate_read_filtering_minimal(): """ Minimal testing """ - outfile = '/tmp/test_minimal.txt' - args = '-b {} -o {}'.format(BAMFILE_FILTER, outfile).split() + outfile = "/tmp/test_minimal.txt" + args = "-b {} -o {}".format(BAMFILE_FILTER, outfile).split() est.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n', - 'test_filtering.bam\t193\t193\t0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n'] + expected = [ + "Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n", + "test_filtering.bam\t193\t193\t0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\n", + ] # strip the path from the output _ = resp[1].split("\t") _[0] = os.path.basename(_[0]) @@ -38,19 +40,23 @@ def test_estimate_read_filtering_params(): """ --minMappingQuality 10 --samFlagExclude 512 --ignoreDuplicates -bl """ - outfile = '/tmp/test_params.txt' - args = '-b {} --minMappingQuality 10 --samFlagExclude 512 --ignoreDuplicates -bl {} -o {}'.format(BAMFILE_FILTER, BEDFILE_FILTER, outfile).split() + outfile = "/tmp/test_params.txt" + args = "-b {} --minMappingQuality 10 --samFlagExclude 512 --ignoreDuplicates -bl {} -o {}".format( + BAMFILE_FILTER, BEDFILE_FILTER, outfile + ).split() est.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() # strip the path from the output _ = resp[1].split("\t") _[0] = os.path.basename(_[0]) resp[1] = "\t".join(_) - expected = ['Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n', - 'test_filtering.bam\t193\t193\t7\t193\t41.4\t0.0\t186.5\t31.6\t0.0\t0.0\t0.0\n'] + expected = [ + "Sample\tTotal Reads\tMapped Reads\tAlignments in blacklisted regions\tEstimated mapped reads filtered\tBelow MAPQ\tMissing Flags\tExcluded Flags\tInternally-determined Duplicates\tMarked Duplicates\tSingletons\tWrong strand\n", + "test_filtering.bam\t193\t193\t7\t193\t41.4\t0.0\t186.5\t31.6\t0.0\t0.0\t0.0\n", + ] assert_equal(resp, expected) unlink(outfile) @@ -59,27 +65,31 @@ def test_sieve(): """ Test filtering a BAM file by MAPQ, flag, and blacklist """ - outfile = '/tmp/test_sieve.bam' - outfiltered = '/tmp/test_sieveFiltered.bam' - outlog = '/tmp/test_sieve.log' - args = '-b {} --smartLabels --minMappingQuality 10 
--samFlagExclude 512 -bl {} -o {} --filterMetrics {} --filteredOutReads {}'.format(BAMFILE_FILTER, BEDFILE_FILTER, outfile, outlog, outfiltered).split() + outfile = "/tmp/test_sieve.bam" + outfiltered = "/tmp/test_sieveFiltered.bam" + outlog = "/tmp/test_sieve.log" + args = "-b {} --smartLabels --minMappingQuality 10 --samFlagExclude 512 -bl {} -o {} --filterMetrics {} --filteredOutReads {}".format( + BAMFILE_FILTER, BEDFILE_FILTER, outfile, outlog, outfiltered + ).split() sieve.main(args) - _foo = open(outlog, 'r') + _foo = open(outlog, "r") resp = _foo.readlines() _foo.close() - expected = ['#bamFilterReads --filterMetrics\n', - '#File\tReads Remaining\tTotal Initial Reads\n', - 'test_filtering\t5\t193\n'] + expected = [ + "#bamFilterReads --filterMetrics\n", + "#File\tReads Remaining\tTotal Initial Reads\n", + "test_filtering\t5\t193\n", + ] assert_equal(resp, expected) unlink(outlog) - h = hashlib.md5(pysam.view(outfile).encode('utf-8')).hexdigest() - assert(h == "acbc4443fb0387bfd6c412af9d4fc414") + h = hashlib.md5(pysam.view(outfile).encode("utf-8")).hexdigest() + assert h == "acbc4443fb0387bfd6c412af9d4fc414" unlink(outfile) - h1 = hashlib.md5(pysam.view(outfiltered).encode('utf-8')).hexdigest() - assert(h1 == "b90befdd5f073f14acb9a38661f301ad") + h1 = hashlib.md5(pysam.view(outfiltered).encode("utf-8")).hexdigest() + assert h1 == "b90befdd5f073f14acb9a38661f301ad" unlink(outfiltered) @@ -87,39 +97,43 @@ def test_sieve_BED(): """ Test alignmentSieve with the --BED option """ - outfile = '/tmp/test_sieve.bed' - args = '-b {} --minMappingQuality 10 --BED -o {}'.format(PAIREDBAMFILE_FILTER, outfile).split() + outfile = "/tmp/test_sieve.bed" + args = "-b {} --minMappingQuality 10 --BED -o {}".format( + PAIREDBAMFILE_FILTER, outfile + ).split() sieve.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['chr2\t5000026\t5000390\n', - 'chr2\t5000303\t5000711\n', - 'chr2\t5000384\t5000531\n', - 'chr2\t5000384\t5000531\n', - 'chr2\t5000559\t5000941\n', - 'chr2\t5000736\t5001171\n', - 'chr2\t5000819\t5001228\n', - 'chr2\t5000821\t5001158\n', - 'chr2\t5000821\t5001158\n', - 'chr2\t5000821\t5001158\n', - 'chr2\t5000834\t5001249\n', - 'chr2\t5000855\t5001277\n', - 'chr2\t5000867\t5001218\n', - 'chr2\t5000925\t5001023\n', - 'chr2\t5000925\t5001023\n', - 'chr2\t5000937\t5001338\n', - 'chr2\t5001010\t5001176\n', - 'chr2\t5001025\t5001431\n', - 'chr2\t5001050\t5001436\n', - 'chr2\t5001114\t5001413\n', - 'chr2\t5001115\t5001269\n', - 'chr2\t5001115\t5001269\n', - 'chr2\t5001226\t5001603\n', - 'chr2\t5001491\t5001527\n', - 'chr2\t5001700\t5001736\n'] + expected = [ + "chr2\t5000026\t5000390\n", + "chr2\t5000303\t5000711\n", + "chr2\t5000384\t5000531\n", + "chr2\t5000384\t5000531\n", + "chr2\t5000559\t5000941\n", + "chr2\t5000736\t5001171\n", + "chr2\t5000819\t5001228\n", + "chr2\t5000821\t5001158\n", + "chr2\t5000821\t5001158\n", + "chr2\t5000821\t5001158\n", + "chr2\t5000834\t5001249\n", + "chr2\t5000855\t5001277\n", + "chr2\t5000867\t5001218\n", + "chr2\t5000925\t5001023\n", + "chr2\t5000925\t5001023\n", + "chr2\t5000937\t5001338\n", + "chr2\t5001010\t5001176\n", + "chr2\t5001025\t5001431\n", + "chr2\t5001050\t5001436\n", + "chr2\t5001114\t5001413\n", + "chr2\t5001115\t5001269\n", + "chr2\t5001115\t5001269\n", + "chr2\t5001226\t5001603\n", + "chr2\t5001491\t5001527\n", + "chr2\t5001700\t5001736\n", + ] assert_equal(resp, expected) unlink(outfile) @@ -129,37 +143,41 @@ def test_sieve_BED_shift(): """ Test alignmentSieve --BED --shift """ 
- outfile = '/tmp/test_sieve_shift.bed' - args = '-b {} --minMappingQuality 10 --BED -o {} --shift 1 -2 3 -4'.format(PAIREDBAMFILE_FILTER, outfile).split() + outfile = "/tmp/test_sieve_shift.bed" + args = "-b {} --minMappingQuality 10 --BED -o {} --shift 1 -2 3 -4".format( + PAIREDBAMFILE_FILTER, outfile + ).split() sieve.main(args) - _foo = open(outfile, 'r') + _foo = open(outfile, "r") resp = _foo.readlines() _foo.close() - expected = ['chr2\t5000027\t5000388\n', - 'chr2\t5000307\t5000708\n', - 'chr2\t5000388\t5000528\n', - 'chr2\t5000385\t5000529\n', - 'chr2\t5000560\t5000939\n', - 'chr2\t5000737\t5001169\n', - 'chr2\t5000823\t5001225\n', - 'chr2\t5000825\t5001155\n', - 'chr2\t5000825\t5001155\n', - 'chr2\t5000825\t5001155\n', - 'chr2\t5000835\t5001247\n', - 'chr2\t5000859\t5001274\n', - 'chr2\t5000868\t5001216\n', - 'chr2\t5000929\t5001020\n', - 'chr2\t5000929\t5001020\n', - 'chr2\t5000941\t5001335\n', - 'chr2\t5001011\t5001174\n', - 'chr2\t5001026\t5001429\n', - 'chr2\t5001054\t5001433\n', - 'chr2\t5001118\t5001410\n', - 'chr2\t5001119\t5001266\n', - 'chr2\t5001119\t5001266\n', - 'chr2\t5001230\t5001600\n'] + expected = [ + "chr2\t5000027\t5000388\n", + "chr2\t5000307\t5000708\n", + "chr2\t5000388\t5000528\n", + "chr2\t5000385\t5000529\n", + "chr2\t5000560\t5000939\n", + "chr2\t5000737\t5001169\n", + "chr2\t5000823\t5001225\n", + "chr2\t5000825\t5001155\n", + "chr2\t5000825\t5001155\n", + "chr2\t5000825\t5001155\n", + "chr2\t5000835\t5001247\n", + "chr2\t5000859\t5001274\n", + "chr2\t5000868\t5001216\n", + "chr2\t5000929\t5001020\n", + "chr2\t5000929\t5001020\n", + "chr2\t5000941\t5001335\n", + "chr2\t5001011\t5001174\n", + "chr2\t5001026\t5001429\n", + "chr2\t5001054\t5001433\n", + "chr2\t5001118\t5001410\n", + "chr2\t5001119\t5001266\n", + "chr2\t5001119\t5001266\n", + "chr2\t5001230\t5001600\n", + ] assert_equal(resp, expected) unlink(outfile) diff --git a/deeptools/test/test_writeBedGraph.py b/deeptools/test/test_writeBedGraph.py index c419684f3..9fb1aece8 100644 --- a/deeptools/test/test_writeBedGraph.py +++ b/deeptools/test/test_writeBedGraph.py @@ -7,11 +7,10 @@ ROOT = os.path.dirname(os.path.abspath(__file__)) + "/test_data/" -__author__ = 'fidel' +__author__ = "fidel" class TestWriteBedGraph(TestCase): - def setUp(self): """ The distribution of reads between the two bam files is as follows. 
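
Before the next hunk, a note on the output it asserts: `writeBedGraph_worker` is expected to emit standard 4-column bedGraph records (chrom, 0-based start, end, value), with runs of adjacent equal-valued bins merged into a single interval. A minimal sketch of that merging behavior as the assertions imply it; `to_bedgraph` is a hypothetical illustration, not the code under test:

```python
def to_bedgraph(chrom, bin_starts, bin_len, values):
    # Collapse consecutive bins with identical values into single intervals.
    lines, run_start, prev = [], bin_starts[0], values[0]
    for s, v in zip(bin_starts[1:], values[1:]):
        if v != prev:
            lines.append("{}\t{}\t{}\t{}".format(chrom, run_start, s, prev))
            run_start, prev = s, v
    lines.append(
        "{}\t{}\t{}\t{}".format(chrom, run_start, bin_starts[-1] + bin_len, prev)
    )
    return lines


# Four 50-bp bins on 3R with coverages 0, 0, 1, 1 collapse into the two
# intervals that test_writeBedGraph_worker checks for below:
assert to_bedgraph("3R", [0, 50, 100, 150], 50, [0, 0, 1, 1]) == [
    "3R\t0\t100\t0",
    "3R\t100\t200\t1",
]
```
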
@@ -33,68 +32,81 @@ def setUp(self): self.bamFile1 = self.root + "testA.bam" self.bamFile2 = self.root + "testB.bam" self.bamFile_PE = self.root + "test_paired2.bam" - self.chrom = '3R' + self.chrom = "3R" self.step_size = 50 self.bin_length = 50 - self.func_args = {'scaleFactor': 1.0} + self.func_args = {"scaleFactor": 1.0} - self.c = wr.WriteBedGraph([self.bamFile1], - binLength=self.bin_length, - stepSize=self.step_size) + self.c = wr.WriteBedGraph( + [self.bamFile1], binLength=self.bin_length, stepSize=self.step_size + ) def test_writeBedGraph_worker(self): self.c.zerosToNans = False self.c.skipZeros = False - tempFile = self.c.writeBedGraph_worker('3R', 0, 200, scaleCoverage, self.func_args) - _foo = open(tempFile[3], 'r') + tempFile = self.c.writeBedGraph_worker( + "3R", 0, 200, scaleCoverage, self.func_args + ) + _foo = open(tempFile[3], "r") res = _foo.readlines() _foo.close() - assert_equal(res, ['3R\t0\t100\t0\n', '3R\t100\t200\t1\n']) + assert_equal(res, ["3R\t0\t100\t0\n", "3R\t100\t200\t1\n"]) os.remove(tempFile[3]) def test_writeBedGraph_worker_zerotonan(self): # turn on zeroToNan self.c.zerosToNans = True - tempFile2 = self.c.writeBedGraph_worker('3R', 0, 200, scaleCoverage, self.func_args) - _foo = open(tempFile2[3], 'r') + tempFile2 = self.c.writeBedGraph_worker( + "3R", 0, 200, scaleCoverage, self.func_args + ) + _foo = open(tempFile2[3], "r") res = _foo.readlines() _foo.close() - assert_equal(res, ['3R\t100\t200\t1\n']) + assert_equal(res, ["3R\t100\t200\t1\n"]) os.remove(tempFile2[3]) def test_writeBedGraph_worker_scaling(self): - func_args = {'scaleFactor': 3.0} - tempFile = self.c.writeBedGraph_worker('3R', 0, 200, scaleCoverage, func_args) - _foo = open(tempFile[3], 'r') + func_args = {"scaleFactor": 3.0} + tempFile = self.c.writeBedGraph_worker("3R", 0, 200, scaleCoverage, func_args) + _foo = open(tempFile[3], "r") res = _foo.readlines() _foo.close() - assert_equal(res, ['3R\t0\t100\t0\n', '3R\t100\t200\t3\n']) + assert_equal(res, ["3R\t0\t100\t0\n", "3R\t100\t200\t3\n"]) os.remove(tempFile[3]) def test_writeBedGraph_worker_ignore_duplicates(self): - self.c = wr.WriteBedGraph([self.bamFile2], - binLength=self.bin_length, - stepSize=self.step_size, ignoreDuplicates=True) + self.c = wr.WriteBedGraph( + [self.bamFile2], + binLength=self.bin_length, + stepSize=self.step_size, + ignoreDuplicates=True, + ) self.c.zerosToNans = True - tempFile = self.c.writeBedGraph_worker('3R', 0, 200, scaleCoverage, self.func_args) - _foo = open(tempFile[3], 'r') + tempFile = self.c.writeBedGraph_worker( + "3R", 0, 200, scaleCoverage, self.func_args + ) + _foo = open(tempFile[3], "r") res = _foo.readlines() _foo.close() - assert_equal(res, ['3R\t50\t200\t1\n']) + assert_equal(res, ["3R\t50\t200\t1\n"]) os.remove(tempFile[3]) def test_writeBedGraph_worker_smoothing(self): self.c.binLength = 20 self.c.stepSize = 20 self.c.smoothLength = 60 - tempFile = self.c.writeBedGraph_worker('3R', 100, 200, scaleCoverage, self.func_args) - _foo = open(tempFile[3], 'r') + tempFile = self.c.writeBedGraph_worker( + "3R", 100, 200, scaleCoverage, self.func_args + ) + _foo = open(tempFile[3], "r") res = _foo.readlines() _foo.close() - assert_equal(res, ['3R\t100\t120\t1\n', '3R\t120\t180\t1.33333\n', '3R\t180\t200\t1\n']) + assert_equal( + res, ["3R\t100\t120\t1\n", "3R\t120\t180\t1.33333\n", "3R\t180\t200\t1\n"] + ) os.remove(tempFile[3]) def test_writeBedGraph_cigar(self): @@ -108,17 +120,24 @@ def test_writeBedGraph_cigar(self): self.c.extendPairedEnds = False self.c.binLength = 10 self.c.stepSize = 10 
- tempFile = self.c.writeBedGraph_worker('chr_cigar', 0, 100, scaleCoverage, self.func_args) - _foo = open(tempFile[3], 'r') + tempFile = self.c.writeBedGraph_worker( + "chr_cigar", 0, 100, scaleCoverage, self.func_args + ) + _foo = open(tempFile[3], "r") res = _foo.readlines() _foo.close() # the sigle read is split into bin 10-30, and then 40-50 - assert_equal(res, ['chr_cigar\t0\t10\t0\n', - 'chr_cigar\t10\t30\t1\n', - 'chr_cigar\t30\t40\t0\n', - 'chr_cigar\t40\t50\t1\n', - 'chr_cigar\t50\t100\t0\n']) + assert_equal( + res, + [ + "chr_cigar\t0\t10\t0\n", + "chr_cigar\t10\t30\t1\n", + "chr_cigar\t30\t40\t0\n", + "chr_cigar\t40\t50\t1\n", + "chr_cigar\t50\t100\t0\n", + ], + ) os.remove(tempFile[3]) @@ -132,12 +151,12 @@ def setUp(self): self.bamFile1 = self.root + "testA.cram" self.bamFile2 = self.root + "testB.cram" self.bamFile_PE = self.root + "test_paired2.cram" - self.chrom = '3R' + self.chrom = "3R" self.step_size = 50 self.bin_length = 50 - self.func_args = {'scaleFactor': 1.0} + self.func_args = {"scaleFactor": 1.0} - self.c = wr.WriteBedGraph([self.bamFile1], - binLength=self.bin_length, - stepSize=self.step_size) + self.c = wr.WriteBedGraph( + [self.bamFile1], binLength=self.bin_length, stepSize=self.step_size + ) diff --git a/deeptools/test/testskip_heatmapper_images.py b/deeptools/test/testskip_heatmapper_images.py index 8c8d56e95..599aa1ff4 100644 --- a/deeptools/test/testskip_heatmapper_images.py +++ b/deeptools/test/testskip_heatmapper_images.py @@ -1,6 +1,7 @@ import os import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") from matplotlib.testing.compare import compare_images from tempfile import NamedTemporaryFile @@ -9,7 +10,7 @@ import deeptools.plotProfile import deeptools.utilities -__author__ = 'Fidel' +__author__ = "Fidel" ROOT = os.path.dirname(os.path.abspath(__file__)) + "/test_heatmapper/" tolerance = 30 @@ -30,10 +31,12 @@ def test_plotHeatmap_simple_plot(): """ if skip: return - outfile = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False) + outfile = NamedTemporaryFile( + suffix=".png", prefix="plotHeatmap_test_", delete=False + ) args = "-m {}/master.mat.gz --outFileName {}".format(ROOT, outfile.name).split() deeptools.plotHeatmap.main(args) - res = compare_images(ROOT + '/master.png', outfile.name, tolerance) + res = compare_images(ROOT + "/master.png", outfile.name, tolerance) assert res is None, res os.remove(outfile.name) @@ -41,11 +44,15 @@ def test_plotHeatmap_simple_plot(): def test_plotHeatmap_rename_labels(): if skip: return - outfile = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False) + outfile = NamedTemporaryFile( + suffix=".png", prefix="plotHeatmap_test_", delete=False + ) - args = "-m {}/master.mat.gz --outFileName {} --regionsLabel uno dos".format(ROOT, outfile.name).split() + args = "-m {}/master.mat.gz --outFileName {} --regionsLabel uno dos".format( + ROOT, outfile.name + ).split() deeptools.plotHeatmap.main(args) - res = compare_images(ROOT + '/master_relabeled.png', outfile.name, tolerance) + res = compare_images(ROOT + "/master_relabeled.png", outfile.name, tolerance) assert res is None, res os.remove(outfile.name) @@ -53,10 +60,14 @@ def test_plotHeatmap_rename_labels(): def test_plotHeatmap_scale_regions(): if skip: return - outfile = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False) - args = "-m {}/master_scale_reg.mat.gz --outFileName {}".format(ROOT, outfile.name).split() + outfile = NamedTemporaryFile( + suffix=".png", 
prefix="plotHeatmap_test_", delete=False + ) + args = "-m {}/master_scale_reg.mat.gz --outFileName {}".format( + ROOT, outfile.name + ).split() deeptools.plotHeatmap.main(args) - res = compare_images(ROOT + '/master_scale_reg.png', outfile.name, tolerance) + res = compare_images(ROOT + "/master_scale_reg.png", outfile.name, tolerance) assert res is None, res os.remove(outfile.name) @@ -64,11 +75,17 @@ def test_plotHeatmap_scale_regions(): def test_plotHeatmap_multi_bigwig_pergroup(): if skip: return - outfile = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False) - args = "-m {}/master_multi.mat.gz --perGroup --samplesLabel file1 file2 file3 file4 " \ - "--outFileName {}".format(ROOT, outfile.name).split() + outfile = NamedTemporaryFile( + suffix=".png", prefix="plotHeatmap_test_", delete=False + ) + args = ( + "-m {}/master_multi.mat.gz --perGroup --samplesLabel file1 file2 file3 file4 " + "--outFileName {}".format(ROOT, outfile.name).split() + ) deeptools.plotHeatmap.main(args) - res = compare_images(ROOT + '/heatmap_master_multi_pergroup.png', outfile.name, tolerance) + res = compare_images( + ROOT + "/heatmap_master_multi_pergroup.png", outfile.name, tolerance + ) assert res is None, res os.remove(outfile.name) @@ -76,11 +93,17 @@ def test_plotHeatmap_multi_bigwig_pergroup(): def test_plotHeatmap_multiple_colors_muti_scales(): if skip: return - outfile = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False) - args = "-m {}/master_multi.mat.gz --colorList white,blue white,red --zMin 1 0 --zMax 4 5 " \ - "--outFileName {}".format(ROOT, outfile.name).split() + outfile = NamedTemporaryFile( + suffix=".png", prefix="plotHeatmap_test_", delete=False + ) + args = ( + "-m {}/master_multi.mat.gz --colorList white,blue white,red --zMin 1 0 --zMax 4 5 " + "--outFileName {}".format(ROOT, outfile.name).split() + ) deeptools.plotHeatmap.main(args) - res = compare_images(ROOT + '/heatmap_master_multi_color.png', outfile.name, tolerance) + res = compare_images( + ROOT + "/heatmap_master_multi_color.png", outfile.name, tolerance + ) assert res is None, res os.remove(outfile.name) @@ -88,11 +111,17 @@ def test_plotHeatmap_multiple_colors_muti_scales(): def test_plotHeatmap_multiple_colormap_no_boxes(): if skip: return - outfile = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False) - args = "-m {}/master_multi.mat.gz --colorMap Reds binary terrain --boxAroundHeatmaps no " \ - "--outFileName {}".format(ROOT, outfile.name).split() + outfile = NamedTemporaryFile( + suffix=".png", prefix="plotHeatmap_test_", delete=False + ) + args = ( + "-m {}/master_multi.mat.gz --colorMap Reds binary terrain --boxAroundHeatmaps no " + "--outFileName {}".format(ROOT, outfile.name).split() + ) deeptools.plotHeatmap.main(args) - res = compare_images(ROOT + '/heatmap_master_multi_colormap_no_box.png', outfile.name, tolerance) + res = compare_images( + ROOT + "/heatmap_master_multi_colormap_no_box.png", outfile.name, tolerance + ) assert res is None, res os.remove(outfile.name) @@ -100,11 +129,17 @@ def test_plotHeatmap_multiple_colormap_no_boxes(): def test_plotHeatmap_interpolation(): if skip: return - outfile = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False) - args = "-m {}/large_matrix.mat.gz --interpolation bilinear " \ - "--outFileName {}".format(ROOT, outfile.name).split() + outfile = NamedTemporaryFile( + suffix=".png", prefix="plotHeatmap_test_", delete=False + ) + args = ( + "-m {}/large_matrix.mat.gz --interpolation 
bilinear " + "--outFileName {}".format(ROOT, outfile.name).split() + ) deeptools.plotHeatmap.main(args) - res = compare_images(ROOT + '/heatmap_master_interpolation_bilinear.png', outfile.name, tolerance) + res = compare_images( + ROOT + "/heatmap_master_interpolation_bilinear.png", outfile.name, tolerance + ) assert res is None, res os.remove(outfile.name) @@ -112,11 +147,15 @@ def test_plotHeatmap_interpolation(): def test_plotProfiler(): if skip: return - outfile = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False) - args = "-m {}/master.mat.gz --outFileName {} --regionsLabel uno dos " \ - "--plotType std".format(ROOT, outfile.name).split() + outfile = NamedTemporaryFile( + suffix=".png", prefix="plotHeatmap_test_", delete=False + ) + args = ( + "-m {}/master.mat.gz --outFileName {} --regionsLabel uno dos " + "--plotType std".format(ROOT, outfile.name).split() + ) deeptools.plotProfile.main(args) - res = compare_images(ROOT + '/profile_master.png', outfile.name, tolerance) + res = compare_images(ROOT + "/profile_master.png", outfile.name, tolerance) assert res is None, res os.remove(outfile.name) @@ -124,10 +163,14 @@ def test_plotProfiler(): def test_plotProfiler_heatmap(): if skip: return - outfile = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False) - args = "-m {}/master.mat.gz --outFileName {} --plotType heatmap".format(ROOT, outfile.name).split() + outfile = NamedTemporaryFile( + suffix=".png", prefix="plotHeatmap_test_", delete=False + ) + args = "-m {}/master.mat.gz --outFileName {} --plotType heatmap".format( + ROOT, outfile.name + ).split() deeptools.plotProfile.main(args) - res = compare_images(ROOT + '/profile_master_heatmap.png', outfile.name, tolerance) + res = compare_images(ROOT + "/profile_master_heatmap.png", outfile.name, tolerance) assert res is None, res os.remove(outfile.name) @@ -135,11 +178,17 @@ def test_plotProfiler_heatmap(): def test_plotProfiler_overlapped_lines(): if skip: return - outfile = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False) - args = "-m {}/master.mat.gz --outFileName {} " \ - "--plotType overlapped_lines --yMin -1".format(ROOT, outfile.name).split() + outfile = NamedTemporaryFile( + suffix=".png", prefix="plotHeatmap_test_", delete=False + ) + args = ( + "-m {}/master.mat.gz --outFileName {} " + "--plotType overlapped_lines --yMin -1".format(ROOT, outfile.name).split() + ) deeptools.plotProfile.main(args) - res = compare_images(ROOT + '/profile_master_overlap_lines.png', outfile.name, tolerance) + res = compare_images( + ROOT + "/profile_master_overlap_lines.png", outfile.name, tolerance + ) assert res is None, res os.remove(outfile.name) @@ -147,11 +196,15 @@ def test_plotProfiler_overlapped_lines(): def test_plotProfiler_multibigwig(): if skip: return - outfile = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False) - args = "-m {}/master_multi.mat.gz --outFileName {} " \ - "--numPlotsPerRow 2 --yMax 1.5".format(ROOT, outfile.name).split() + outfile = NamedTemporaryFile( + suffix=".png", prefix="plotHeatmap_test_", delete=False + ) + args = ( + "-m {}/master_multi.mat.gz --outFileName {} " + "--numPlotsPerRow 2 --yMax 1.5".format(ROOT, outfile.name).split() + ) deeptools.plotProfile.main(args) - res = compare_images(ROOT + '/profile_master_multi.png', outfile.name, tolerance) + res = compare_images(ROOT + "/profile_master_multi.png", outfile.name, tolerance) assert res is None, res os.remove(outfile.name) @@ -159,10 +212,16 @@ def 
test_plotProfiler_multibigwig(): def test_plotProfiler_multibigwig_pergroup(): if skip: return - outfile = NamedTemporaryFile(suffix='.png', prefix='plotHeatmap_test_', delete=False) - args = "-m {}/master_multi.mat.gz --outFileName {} " \ - "--perGroup --yMax 1.5".format(ROOT, outfile.name).split() + outfile = NamedTemporaryFile( + suffix=".png", prefix="plotHeatmap_test_", delete=False + ) + args = ( + "-m {}/master_multi.mat.gz --outFileName {} " + "--perGroup --yMax 1.5".format(ROOT, outfile.name).split() + ) deeptools.plotProfile.main(args) - res = compare_images(ROOT + '/profile_master_multi_pergroup.png', outfile.name, tolerance) + res = compare_images( + ROOT + "/profile_master_multi_pergroup.png", outfile.name, tolerance + ) assert res is None, res os.remove(outfile.name) diff --git a/deeptools/utilities.py b/deeptools/utilities.py index 64cc7a75c..e025d0ef0 100755 --- a/deeptools/utilities.py +++ b/deeptools/utilities.py @@ -3,7 +3,8 @@ from deeptoolsintervals import GTF from deeptools.bamHandler import openBam import matplotlib as mpl -mpl.use('Agg') + +mpl.use("Agg") from deeptools import cm # noqa: F401 import numpy as np @@ -19,7 +20,7 @@ def smartLabel(label): should be stripped. """ lab = os.path.splitext(os.path.basename(label))[0] - if lab == '': + if lab == "": # Maybe we have a dot file? lab = os.path.basename(label) return lab @@ -42,7 +43,7 @@ def convertCmap(c, vmin=0, vmax=1): colorScale = [] for k in range(255): C = list(map(np.uint8, np.array(cmap(k * h)[:3]) * 255)) - colorScale.append([k * h, 'rgb' + str((C[0], C[1], C[2]))]) + colorScale.append([k * h, "rgb" + str((C[0], C[1], C[2]))]) return colorScale @@ -81,54 +82,88 @@ def getGC_content(tb, chrom, fragStart, fragEnd, fraction=True): if fragEnd > tb.chroms(chrom): fragEnd = tb.chroms(chrom) if sum(bases.values()) < 0.95 * (fragEnd - fragStart): - raise Exception("WARNING: too many NNNs present in {}:{}-{}".format(chrom, fragStart, fragEnd)) + raise Exception( + "WARNING: too many NNNs present in {}:{}-{}".format( + chrom, fragStart, fragEnd + ) + ) return None if fraction: - return (bases['G'] + bases['C']) / float(fragEnd - fragStart) - return bases['G'] + bases['C'] + return (bases["G"] + bases["C"]) / float(fragEnd - fragStart) + return bases["G"] + bases["C"] def tbitToBamChrName(tbitNames, bamNames): - """ checks if the chromosome names from the two-bit and bam file coincide. - In case they do not coincide, a fix is tried. If successful, then - a mapping table is returned. - tbitNames and bamNames should be lists + """checks if the chromosome names from the two-bit and bam file coincide. + In case they do not coincide, a fix is tried. If successful, then + a mapping table is returned. 
+ tbitNames and bamNames should be lists """ chrNameBitToBam = dict((x, x) for x in tbitNames) if set(bamNames) != set(tbitNames): - sys.stderr.write("Bam and 2bit do not have matching " - "chromosome names:\n2bit:{}\n\nbam:{}" - "\n\n".format(tbitNames, bamNames)) + sys.stderr.write( + "Bam and 2bit do not have matching " + "chromosome names:\n2bit:{}\n\nbam:{}" + "\n\n".format(tbitNames, bamNames) + ) if len(set(bamNames).intersection(set(tbitNames))) > 0: - sys.stderr.write("Using the following common chromosomes between " - "bam chromosome names and 2bit chromosome " - "names:\n") + sys.stderr.write( + "Using the following common chromosomes between " + "bam chromosome names and 2bit chromosome " + "names:\n" + ) for item in set(bamNames).intersection(set(tbitNames)): sys.stderr.write(item + "\n") - chrNameBitToBam = dict([(x, x) for x in - set(bamNames).intersection(set(tbitNames))]) - elif set(["chr" + x if x != 'dmel_mitochondrion_genome' - else 'chrM' for x in bamNames]) == set(tbitNames): - sys.stderr.write("Adding 'chr' seems to solve the problem. " - "Continuing ...") - chrNameBitToBam = dict([("chr" + x - if x != 'dmel_mitochondrion_genome' - else 'chrM', x) for x in bamNames]) - elif set([x for x in tbitNames if x.count('random') == 0 and - x.count('chrM') == 0]) == set(bamNames): + chrNameBitToBam = dict( + [(x, x) for x in set(bamNames).intersection(set(tbitNames))] + ) + elif set( + [ + "chr" + x if x != "dmel_mitochondrion_genome" else "chrM" + for x in bamNames + ] + ) == set(tbitNames): + sys.stderr.write( + "Adding 'chr' seems to solve the problem. " "Continuing ..." + ) + chrNameBitToBam = dict( + [ + ("chr" + x if x != "dmel_mitochondrion_genome" else "chrM", x) + for x in bamNames + ] + ) + elif set( + [x for x in tbitNames if x.count("random") == 0 and x.count("chrM") == 0] + ) == set(bamNames): if debug: - print("Removing random and mitochondrial chromosomes" - "fixes the problem") - chrNameBitToBam = dict([(x, x) for x in tbitNames - if x.count('random') == 0 and - x.count('chrM') == 0]) - elif len(set(["chr" + x for x in bamNames if x != 'dmel_mitochondrion_genome']).intersection(set(tbitNames))) > 0: - bamNames2 = ["chr" + x for x in bamNames if x != 'dmel_mitochondrion_genome'] - sys.stderr.write("Adding 'chr' seems to solve the problem for the following " - "chromosomes...") + print( + "Removing random and mitochondrial chromosomes" "fixes the problem" + ) + chrNameBitToBam = dict( + [ + (x, x) + for x in tbitNames + if x.count("random") == 0 and x.count("chrM") == 0 + ] + ) + elif ( + len( + set( + ["chr" + x for x in bamNames if x != "dmel_mitochondrion_genome"] + ).intersection(set(tbitNames)) + ) + > 0 + ): + bamNames2 = [ + "chr" + x for x in bamNames if x != "dmel_mitochondrion_genome" + ] + sys.stderr.write( + "Adding 'chr' seems to solve the problem for the following " + "chromosomes..." 
+ ) for item in set(bamNames2).intersection(set(tbitNames)): sys.stderr.write(item + "\n") @@ -136,12 +171,21 @@ def tbitToBamChrName(tbitNames, bamNames): for i in range(len(bamNames)): if bamNames2[i] in tbitNames: chrNameBitToBam.update({bamNames2[i]: bamNames[i]}) - elif len(set([x[3:] for x in bamNames if x.startswith("chr")]).intersection(set(tbitNames))) > 0: + elif ( + len( + set([x[3:] for x in bamNames if x.startswith("chr")]).intersection( + set(tbitNames) + ) + ) + > 0 + ): bamNames = [x for x in bamNames] bamNames2 = [x[3:] for x in bamNames if x.startswith("chr")] if debug: - sys.stderr.write("Removing 'chr' seems to solve the problem for the following " - "chromosomes...") + sys.stderr.write( + "Removing 'chr' seems to solve the problem for the following " + "chromosomes..." + ) for item in set(bamNames).intersection(set(tbitNames)): sys.stderr.write(item + "\n") @@ -168,6 +212,7 @@ def getCommonChrNames(bamFileHandles, verbose=True): Hopefully, only _random and chrM are not common. """ + def get_chrom_and_size(bam_handler): """ Reads the chromosome/scaffold name and the length from @@ -195,18 +240,22 @@ def print_chr_names_and_size(chr_set): # try to add remove 'chr' from the chromosome name _corr_names_size = set() for chrom_name, size in _names_and_size: - if chrom_name.startswith('chr'): + if chrom_name.startswith("chr"): _corr_names_size.add((chrom_name[3:], size)) else: - _corr_names_size.add(('chr' + chrom_name, size)) + _corr_names_size.add(("chr" + chrom_name, size)) if len(common_chr & _corr_names_size) == 0: - message = "No common chromosomes found. Are the bam files files " \ - "from the same species and same assemblies?\n" + message = ( + "No common chromosomes found. Are the bam files files " + "from the same species and same assemblies?\n" + ) sys.stderr.write(message) print_chr_names_and_size(common_chr) - sys.stderr.write("\nand the following is the list of the unmatched chromosome and chromosome\n" - "lengths from file\n{}\n".format(bamFileHandles.name)) + sys.stderr.write( + "\nand the following is the list of the unmatched chromosome and chromosome\n" + "lengths from file\n{}\n".format(bamFileHandles.name) + ) print_chr_names_and_size(_names_and_size) exit(1) else: @@ -216,7 +265,9 @@ def print_chr_names_and_size(chr_set): common_chr = common_chr & _names_and_size if len(non_common_chr) > 0: - sys.stderr.write("\nThe following chromosome names did not match between the the bam files\n") + sys.stderr.write( + "\nThe following chromosome names did not match between the the bam files\n" + ) print_chr_names_and_size(non_common_chr) # the common chromosomes has to be sorted as in the original @@ -229,7 +280,7 @@ def print_chr_names_and_size(chr_set): return chr_sizes, non_common_chr -def copyFileInMemory(filePath, suffix=''): +def copyFileInMemory(filePath, suffix=""): """ copies a file into the special /dev/shm device which moves the file into memory. @@ -237,25 +288,27 @@ def copyFileInMemory(filePath, suffix=''): """ # fallback for windows users - if os.name == 'nt': + if os.name == "nt": return filePath memFileName = getTempFileName(suffix=suffix) import shutil + shutil.copyfile(filePath, memFileName) return memFileName -def getTempFileName(suffix=''): +def getTempFileName(suffix=""): """ Return a temporary file name. The calling function is responsible for deleting this upon completion. 
""" import tempfile - _tempFile = tempfile.NamedTemporaryFile(prefix="_deeptools_", - suffix=suffix, - delete=False) + + _tempFile = tempfile.NamedTemporaryFile( + prefix="_deeptools_", suffix=suffix, delete=False + ) memFileName = _tempFile.name _tempFile.close() @@ -274,7 +327,9 @@ def gtfOptions(allArgs=None): allArgs = vars(allArgs) transcriptID = allArgs.get("transcriptID", transcriptID) exonID = allArgs.get("exonID", exonID) - transcript_id_designator = allArgs.get("transcript_id_designator", transcript_id_designator) + transcript_id_designator = allArgs.get( + "transcript_id_designator", transcript_id_designator + ) keepExons = allArgs.get("keepExons", keepExons) return transcriptID, exonID, transcript_id_designator, keepExons @@ -288,7 +343,7 @@ def toString(s): if isinstance(s, bytes): if sys.version_info[0] == 2: return str(s) - return s.decode('ascii') + return s.decode("ascii") if isinstance(s, list): return [toString(x) for x in s] return s @@ -303,7 +358,7 @@ def toBytes(s): if isinstance(s, bytes): return s if isinstance(s, str): - return bytes(s, 'ascii') + return bytes(s, "ascii") if isinstance(s, list): return [toBytes(x) for x in s] return s @@ -350,13 +405,18 @@ def bam_blacklisted_worker(args): for r in fh.fetch(reference=chrom, start=start, end=end): if r.is_unmapped: continue - if r.reference_start >= start and r.reference_start + r.infer_query_length(always=False) - 1 <= end: + if ( + r.reference_start >= start + and r.reference_start + r.infer_query_length(always=False) - 1 <= end + ): blacklisted += 1 fh.close() return blacklisted -def bam_blacklisted_reads(bam_handle, chroms_to_ignore, blackListFileName=None, numberOfProcessors=1): +def bam_blacklisted_reads( + bam_handle, chroms_to_ignore, blackListFileName=None, numberOfProcessors=1 +): blacklisted = 0 if blackListFileName is None: return blacklisted @@ -367,18 +427,27 @@ def bam_blacklisted_reads(bam_handle, chroms_to_ignore, blackListFileName=None, bl = GTF(blackListFileName) hasOverlaps, minOverlap = bl.hasOverlaps(returnDistance=True) if hasOverlaps: - sys.exit("Your blacklist file(s) has (have) regions that overlap. Proceeding with such a file would result in deepTools incorrectly calculating scaling factors. As such, you MUST fix this issue before being able to proceed.\n") + sys.exit( + "Your blacklist file(s) has (have) regions that overlap. Proceeding with such a file would result in deepTools incorrectly calculating scaling factors. As such, you MUST fix this issue before being able to proceed.\n" + ) if minOverlap < 1000: - sys.stderr.write("WARNING: The minimum distance between intervals in your blacklist is {}. It makes little biological sense to include small regions between two blacklisted regions. Instead, these should likely be blacklisted as well.\n".format(minOverlap)) + sys.stderr.write( + "WARNING: The minimum distance between intervals in your blacklist is {}. It makes little biological sense to include small regions between two blacklisted regions. 
Instead, these should likely be blacklisted as well.\n".format( + minOverlap + ) + ) regions = [] for chrom in bl.chroms: - if (not chroms_to_ignore or chrom not in chroms_to_ignore) and chrom in chromLens: + if ( + not chroms_to_ignore or chrom not in chroms_to_ignore + ) and chrom in chromLens: for reg in bl.findOverlaps(chrom, 0, chromLens[chrom]): regions.append([bam_handle.filename, chrom, reg[0], reg[1]]) if len(regions) > 0: import multiprocessing + if len(regions) > 1 and numberOfProcessors > 1: pool = multiprocessing.Pool(numberOfProcessors) res = pool.map_async(bam_blacklisted_worker, regions).get(9999999) diff --git a/deeptools/writeBedGraph.py b/deeptools/writeBedGraph.py index b8066ea7b..2364b7580 100644 --- a/deeptools/writeBedGraph.py +++ b/deeptools/writeBedGraph.py @@ -12,7 +12,7 @@ from deeptools import utilities debug = 0 -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") def writeBedGraph_wrapper(args): @@ -28,7 +28,6 @@ def writeBedGraph_wrapper(args): class WriteBedGraph(cr.CountReadsPerBin): - r"""Reads bam files coverages and writes a bedgraph or bigwig file Extends the CountReadsPerBin object such that the coverage @@ -90,7 +89,15 @@ class WriteBedGraph(cr.CountReadsPerBin): """ - def run(self, func_to_call, func_args, out_file_name, blackListFileName=None, format="bedgraph", smoothLength=0): + def run( + self, + func_to_call, + func_args, + out_file_name, + blackListFileName=None, + format="bedgraph", + smoothLength=0, + ): r""" Given a list of bamfiles, a function and a function arguments, this method writes a bedgraph file (or bigwig) file @@ -121,14 +128,18 @@ def run(self, func_to_call, func_args, out_file_name, blackListFileName=None, fo bam_handles = [] for x in self.bamFilesList: if getStats: - bam, mapped, unmapped, stats = bamHandler.openBam(x, returnStats=True, nThreads=self.numberOfProcessors) + bam, mapped, unmapped, stats = bamHandler.openBam( + x, returnStats=True, nThreads=self.numberOfProcessors + ) self.mappedList.append(mapped) self.statsList.append(stats) else: bam = bamHandler.openBam(x) bam_handles.append(bam) - genome_chunk_length = getGenomeChunkLength(bam_handles, self.binLength, self.mappedList) + genome_chunk_length = getGenomeChunkLength( + bam_handles, self.binLength, self.mappedList + ) # check if both bam files correspond to the same species # by comparing the chromosome names: chrom_names_and_size, non_common = getCommonChrNames(bam_handles, verbose=False) @@ -142,14 +153,16 @@ def run(self, func_to_call, func_args, out_file_name, blackListFileName=None, fo continue sys.stderr.write("{}: {}\n".format(x, self.__getattribute__(x))) - res = mapReduce.mapReduce([func_to_call, func_args], - writeBedGraph_wrapper, - chrom_names_and_size, - self_=self, - genomeChunkLength=genome_chunk_length, - region=self.region, - blackListFileName=blackListFileName, - numberOfProcessors=self.numberOfProcessors) + res = mapReduce.mapReduce( + [func_to_call, func_args], + writeBedGraph_wrapper, + chrom_names_and_size, + self_=self, + genomeChunkLength=genome_chunk_length, + region=self.region, + blackListFileName=blackListFileName, + numberOfProcessors=self.numberOfProcessors, + ) # Determine the sorted order of the temp files chrom_order = dict() @@ -158,11 +171,11 @@ def run(self, func_to_call, func_args, out_file_name, blackListFileName=None, fo res = [[chrom_order[x[0]], x[1], x[2], x[3]] for x in res] res.sort() - if format == 'bedgraph': - out_file = open(out_file_name, 'wb') + if format == "bedgraph": + out_file = 
open(out_file_name, "wb") for r in res: if r[3]: - _foo = open(r[3], 'rb') + _foo = open(r[3], "rb") shutil.copyfileobj(_foo, out_file) _foo.close() os.remove(r[3]) @@ -170,9 +183,9 @@ def run(self, func_to_call, func_args, out_file_name, blackListFileName=None, fo else: bedGraphToBigWig(chrom_names_and_size, [x[3] for x in res], out_file_name) - def writeBedGraph_worker(self, chrom, start, end, - func_to_call, func_args, - bed_regions_list=None): + def writeBedGraph_worker( + self, chrom, start, end, func_to_call, func_args, bed_regions_list=None + ): r"""Writes a bedgraph based on the read coverage found on bamFiles The given func is called to compute the desired bedgraph value @@ -225,21 +238,21 @@ def writeBedGraph_worker(self, chrom, start, end, """ if start > end: - raise NameError("start position ({0}) bigger " - "than end position ({1})".format(start, end)) + raise NameError( + "start position ({0}) bigger " + "than end position ({1})".format(start, end) + ) coverage, _ = self.count_reads_in_region(chrom, start, end) - _file = open(utilities.getTempFileName(suffix='.bg'), 'w') + _file = open(utilities.getTempFileName(suffix=".bg"), "w") previous_value = None line_string = "{}\t{}\t{}\t{:g}\n" for tileIndex in range(coverage.shape[0]): - if self.smoothLength is not None and self.smoothLength > 0: - vector_start, vector_end = self.getSmoothRange(tileIndex, - self.binLength, - self.smoothLength, - coverage.shape[0]) + vector_start, vector_end = self.getSmoothRange( + tileIndex, self.binLength, self.smoothLength, coverage.shape[0] + ) tileCoverage = np.mean(coverage[vector_start:vector_end, :], axis=0) else: tileCoverage = coverage[tileIndex, :] @@ -268,15 +281,19 @@ def writeBedGraph_worker(self, chrom, start, end, elif previous_value != value: if not np.isnan(previous_value): _file.write( - line_string.format(chrom, writeStart, writeEnd, previous_value)) + line_string.format(chrom, writeStart, writeEnd, previous_value) + ) previous_value = value writeStart = writeEnd writeEnd = min(writeStart + self.binLength, end) # write remaining value if not a nan - if previous_value is not None and writeStart != end and not np.isnan(previous_value): - _file.write(line_string.format(chrom, writeStart, - end, previous_value)) + if ( + previous_value is not None + and writeStart != end + and not np.isnan(previous_value) + ): + _file.write(line_string.format(chrom, writeStart, end, previous_value)) tempfilename = _file.name _file.close() @@ -289,7 +306,7 @@ def bedGraphToBigWig(chromSizes, bedGraphFiles, bigWigPath): The order of bedGraphFiles must match that of chromSizes! 
""" bw = pyBigWig.open(bigWigPath, "w") - assert(bw is not None) + assert bw is not None bw.addHeader(chromSizes, maxZooms=10) lastChrom = None starts = [] @@ -303,7 +320,9 @@ def bedGraphToBigWig(chromSizes, bedGraphFiles, bigWigPath): # Buffer up to a million entries if interval[0] != lastChrom or len(starts) == 1000000: if lastChrom is not None: - bw.addEntries([lastChrom] * len(starts), starts, ends=ends, values=vals) + bw.addEntries( + [lastChrom] * len(starts), starts, ends=ends, values=vals + ) lastChrom = interval[0] starts = [int(interval[1])] ends = [int(interval[2])] @@ -344,7 +363,7 @@ def scaleCoverage(tile_coverage, args): """ tileCoverage should be an list with only one element """ - return args['scaleFactor'] * tile_coverage[0] + return args["scaleFactor"] * tile_coverage[0] def ratio(tile_coverage, args): diff --git a/deeptools/writeBedGraph_bam_and_bw.py b/deeptools/writeBedGraph_bam_and_bw.py index 61cd47a42..dc5013768 100644 --- a/deeptools/writeBedGraph_bam_and_bw.py +++ b/deeptools/writeBedGraph_bam_and_bw.py @@ -16,11 +16,12 @@ from deeptools.writeBedGraph import * from deeptools import bamHandler -old_settings = np.seterr(all='ignore') +old_settings = np.seterr(all="ignore") -def getCoverageFromBigwig(bigwigHandle, chrom, start, end, tileSize, - missingDataAsZero=False): +def getCoverageFromBigwig( + bigwigHandle, chrom, start, end, tileSize, missingDataAsZero=False +): try: coverage = np.asarray(bigwigHandle.values(chrom, start, end)) except TypeError: @@ -33,8 +34,8 @@ def getCoverageFromBigwig(bigwigHandle, chrom, start, end, tileSize, coverage[np.isnan(coverage)] = 0 # average the values per bin cov = np.array( - [np.mean(coverage[x:x + tileSize]) - for x in range(0, len(coverage), tileSize)]) + [np.mean(coverage[x : x + tileSize]) for x in range(0, len(coverage), tileSize)] + ) return cov @@ -43,9 +44,20 @@ def writeBedGraph_wrapper(args): def writeBedGraph_worker( - chrom, start, end, tileSize, defaultFragmentLength, - bamOrBwFileList, func, funcArgs, extendPairedEnds=True, smoothLength=0, - skipZeroOverZero=False, missingDataAsZero=False, fixed_step=False): + chrom, + start, + end, + tileSize, + defaultFragmentLength, + bamOrBwFileList, + func, + funcArgs, + extendPairedEnds=True, + smoothLength=0, + skipZeroOverZero=False, + missingDataAsZero=False, + fixed_step=False, +): r""" Writes a bedgraph having as base a number of bam files. 
@@ -55,25 +67,35 @@ def writeBedGraph_worker( tileSize """ if start > end: - raise NameError("start position ({0}) bigger than " - "end position ({1})".format(start, end)) + raise NameError( + "start position ({0}) bigger than " "end position ({1})".format(start, end) + ) coverage = [] for indexFile, fileFormat in bamOrBwFileList: - if fileFormat == 'bam': + if fileFormat == "bam": bamHandle = bamHandler.openBam(indexFile) - coverage.append(getCoverageFromBam( - bamHandle, chrom, start, end, tileSize, - defaultFragmentLength, extendPairedEnds, - True)) + coverage.append( + getCoverageFromBam( + bamHandle, + chrom, + start, + end, + tileSize, + defaultFragmentLength, + extendPairedEnds, + True, + ) + ) bamHandle.close() - elif fileFormat == 'bigwig': + elif fileFormat == "bigwig": bigwigHandle = pyBigWig.open(indexFile) coverage.append( getCoverageFromBigwig( - bigwigHandle, chrom, start, end, - tileSize, missingDataAsZero)) + bigwigHandle, chrom, start, end, tileSize, missingDataAsZero + ) + ) bigwigHandle.close() _file = tempfile.NamedTemporaryFile(delete=False) @@ -81,21 +103,22 @@ def writeBedGraph_worker( previousValue = None lengthCoverage = len(coverage[0]) for tileIndex in range(lengthCoverage): - tileCoverage = [] for index in range(len(bamOrBwFileList)): if smoothLength > 0: vectorStart, vectorEnd = getSmoothRange( - tileIndex, tileSize, smoothLength, lengthCoverage) - tileCoverage.append( - np.mean(coverage[index][vectorStart:vectorEnd])) + tileIndex, tileSize, smoothLength, lengthCoverage + ) + tileCoverage.append(np.mean(coverage[index][vectorStart:vectorEnd])) else: try: tileCoverage.append(coverage[index][tileIndex]) except IndexError: - sys.exit("Chromosome {} probably not in one of the bigwig " - "files. Remove this chromosome from the bigwig file " - "to continue".format(chrom)) + sys.exit( + "Chromosome {} probably not in one of the bigwig " + "files. 
Remove this chromosome from the bigwig file " + "to continue".format(chrom) + ) if skipZeroOverZero and np.sum(tileCoverage) == 0: previousValue = None @@ -107,11 +130,15 @@ def writeBedGraph_worker( writeStart = start + tileIndex * tileSize writeEnd = min(writeStart + tileSize, end) try: - _file.write(toBytes("%s\t%d\t%d\t%.2f\n" % (chrom, writeStart, - writeEnd, value))) + _file.write( + toBytes("%s\t%d\t%d\t%.2f\n" % (chrom, writeStart, writeEnd, value)) + ) except TypeError: - _file.write(toBytes("{}\t{}\t{}\t{}\n".format(chrom, writeStart, - writeEnd, value))) + _file.write( + toBytes( + "{}\t{}\t{}\t{}\n".format(chrom, writeStart, writeEnd, value) + ) + ) else: if previousValue is None: writeStart = start + tileIndex * tileSize @@ -124,18 +151,26 @@ def writeBedGraph_worker( elif previousValue != value: if not np.isnan(previousValue): _file.write( - toBytes("{0}\t{1}\t{2}\t{3:g}\n".format(chrom, writeStart, - writeEnd, previousValue))) + toBytes( + "{0}\t{1}\t{2}\t{3:g}\n".format( + chrom, writeStart, writeEnd, previousValue + ) + ) + ) previousValue = value writeStart = writeEnd writeEnd = min(writeStart + tileSize, end) if not fixed_step: # write remaining value if not a nan - if previousValue and writeStart != end and \ - not np.isnan(previousValue): - _file.write(toBytes("{0}\t{1}\t{2}\t{3:g}\n".format(chrom, writeStart, - end, previousValue))) + if previousValue and writeStart != end and not np.isnan(previousValue): + _file.write( + toBytes( + "{0}\t{1}\t{2}\t{3:g}\n".format( + chrom, writeStart, end, previousValue + ) + ) + ) tempFileName = _file.name _file.close() @@ -143,10 +178,23 @@ def writeBedGraph_worker( def writeBedGraph( - bamOrBwFileList, outputFileName, fragmentLength, - func, funcArgs, tileSize=25, region=None, blackListFileName=None, numberOfProcessors=1, - format="bedgraph", extendPairedEnds=True, missingDataAsZero=False, - skipZeroOverZero=False, smoothLength=0, fixed_step=False, verbose=False): + bamOrBwFileList, + outputFileName, + fragmentLength, + func, + funcArgs, + tileSize=25, + region=None, + blackListFileName=None, + numberOfProcessors=1, + format="bedgraph", + extendPairedEnds=True, + missingDataAsZero=False, + skipZeroOverZero=False, + smoothLength=0, + fixed_step=False, + verbose=False, +): r""" Given a list of bamfiles, a function and a function arguments, this method writes a bedgraph file (or bigwig) file @@ -158,8 +206,10 @@ def writeBedGraph( bamHandles = [] mappedList = [] for indexedFile, fileFormat in bamOrBwFileList: - if fileFormat == 'bam': - bam, mapped, unmapped, stats = bamHandler.openBam(indexedFile, returnStats=True, nThreads=numberOfProcessors) + if fileFormat == "bam": + bam, mapped, unmapped, stats = bamHandler.openBam( + indexedFile, returnStats=True, nThreads=numberOfProcessors + ) bamHandles.append(bam) mappedList.append(mapped) @@ -173,7 +223,7 @@ def writeBedGraph( cCommon = [] chromNamesAndSize = {} for fileName, fileFormat in bamOrBwFileList: - if fileFormat == 'bigwig': + if fileFormat == "bigwig": fh = pyBigWig.open(fileName) else: continue @@ -182,37 +232,56 @@ def writeBedGraph( if chromName in chromNamesAndSize: cCommon.append(chromName) if chromNamesAndSize[chromName] != size: - print("\nWARNING\n" - "Chromosome {} length reported in the " - "input files differ.\n{} for {}\n" - "{} for {}.\n\nThe smallest " - "length will be used".format( - chromName, chromNamesAndSize[chromName], - bamOrBwFileList[0][0], size, fileName)) + print( + "\nWARNING\n" + "Chromosome {} length reported in the " + "input files differ.\n{} for 
{}\n" + "{} for {}.\n\nThe smallest " + "length will be used".format( + chromName, + chromNamesAndSize[chromName], + bamOrBwFileList[0][0], + size, + fileName, + ) + ) chromNamesAndSize[chromName] = min( - chromNamesAndSize[chromName], size) + chromNamesAndSize[chromName], size + ) else: chromNamesAndSize[chromName] = size fh.close() # get the list of common chromosome names and sizes - chromNamesAndSize = [(k, v) for k, v in chromNamesAndSize.items() - if k in cCommon] + chromNamesAndSize = [ + (k, v) for k, v in chromNamesAndSize.items() if k in cCommon + ] if region: # in case a region is used, append the tilesize region += ":{}".format(tileSize) - res = mapReduce.mapReduce((tileSize, fragmentLength, bamOrBwFileList, - func, funcArgs, extendPairedEnds, smoothLength, - skipZeroOverZero, missingDataAsZero, fixed_step), - writeBedGraph_wrapper, - chromNamesAndSize, - genomeChunkLength=genomeChunkLength, - region=region, - blackListFileName=blackListFileName, - numberOfProcessors=numberOfProcessors, - verbose=verbose) + res = mapReduce.mapReduce( + ( + tileSize, + fragmentLength, + bamOrBwFileList, + func, + funcArgs, + extendPairedEnds, + smoothLength, + skipZeroOverZero, + missingDataAsZero, + fixed_step, + ), + writeBedGraph_wrapper, + chromNamesAndSize, + genomeChunkLength=genomeChunkLength, + region=region, + blackListFileName=blackListFileName, + numberOfProcessors=numberOfProcessors, + verbose=verbose, + ) # Determine the sorted order of the temp files chrom_order = dict() @@ -221,11 +290,11 @@ def writeBedGraph( res = [[chrom_order[x[0]], x[1], x[2], x[3]] for x in res] res.sort() - if format == 'bedgraph': - of = open(outputFileName, 'wb') + if format == "bedgraph": + of = open(outputFileName, "wb") for r in res: if r is not None: - _ = open(r[3], 'rb') + _ = open(r[3], "rb") shutil.copyfileobj(_, of) _.close() os.remove(r[3]) diff --git a/docs/conf.py b/docs/conf.py index f88dd260f..afe444d83 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -19,7 +19,14 @@ import mock # MOCK_MODULES = ['numpy', 'numpy.ma', 'scipy', 'pyBigWig'] -MOCK_MODULES = ['pyBigWig', 'py2bit', 'plotly', 'plotly.offline', 'plotly.graph_objs', 'plotly.figure_factory'] +MOCK_MODULES = [ + "pyBigWig", + "py2bit", + "plotly", + "plotly.offline", + "plotly.graph_objs", + "plotly.figure_factory", +] for mod_name in MOCK_MODULES: sys.modules[mod_name] = mock.Mock() @@ -28,7 +35,7 @@ # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.insert(0, os.path.abspath('../..')) +sys.path.insert(0, os.path.abspath("../..")) # -- General configuration ------------------------------------------------ @@ -39,13 +46,13 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', - 'sphinx.ext.coverage', - 'sphinx.ext.mathjax', - 'sphinx.ext.viewcode', - 'sphinx.ext.autosummary', - 'sphinxarg.ext' + "sphinx.ext.autodoc", + "sphinx.ext.doctest", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.viewcode", + "sphinx.ext.autosummary", + "sphinxarg.ext", ] # 'numpydoc' @@ -55,23 +62,25 @@ # 'sphinxcontrib.restbuilder', # Add any paths that contain templates here, relative to this directory. -templates_path = ['source/_templates'] +templates_path = ["source/_templates"] # The suffix(es) of source filenames. 
# You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. # source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = u'deepTools' -author = u'Fidel Ramírez, Friederike Dündar, Björn Grüning, Thomas Manke, Devon Ryan, Fabian Kilpert, ' \ - u'Andreas Richter, Vivek Bhardwaj, Steffen Heyne' +project = "deepTools" +author = ( + "Fidel Ramírez, Friederike Dündar, Björn Grüning, Thomas Manke, Devon Ryan, Fabian Kilpert, " + "Andreas Richter, Vivek Bhardwaj, Steffen Heyne" +) # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -82,6 +91,7 @@ def get_version(): import re + try: f = open("../deeptools/_version.py") except EnvironmentError: @@ -120,7 +130,7 @@ def get_version(): # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all # documents. @@ -138,7 +148,7 @@ def get_version(): # show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. # modindex_common_prefix = [] @@ -155,11 +165,12 @@ def get_version(): # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'classic' -on_rtd = os.environ.get('READTHEDOCS', None) == 'True' +on_rtd = os.environ.get("READTHEDOCS", None) == "True" if not on_rtd: # only import and set the theme if we're building docs locally import sphinx_rtd_theme - html_theme = 'sphinx_rtd_theme' + + html_theme = "sphinx_rtd_theme" html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] @@ -190,7 +201,7 @@ def get_version(): # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied @@ -254,20 +265,17 @@ def get_version(): # html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. -htmlhelp_basename = 'deepToolsdoc' +htmlhelp_basename = "deepToolsdoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # 'preamble': '', - # Latex figure (float) alignment # 'figure_align': 'htbp', } @@ -276,8 +284,13 @@ def get_version(): # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). 
latex_documents = [ - (master_doc, 'deepTools.tex', u'deepTools Documentation', - u'Fidel Ramírez, Friederike Dündar, Björn Grüning, Thomas Manke', 'manual'), + ( + master_doc, + "deepTools.tex", + "deepTools Documentation", + "Fidel Ramírez, Friederike Dündar, Björn Grüning, Thomas Manke", + "manual", + ), ] # The name of an image file (relative to this directory) to place at the top of @@ -305,10 +318,7 @@ def get_version(): # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'deeptools', u'deepTools Documentation', - [author], 1) -] +man_pages = [(master_doc, "deeptools", "deepTools Documentation", [author], 1)] # If true, show URL addresses after external links. # man_show_urls = False @@ -320,9 +330,15 @@ def get_version(): # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'deepTools', u'deepTools Documentation', - author, 'deepTools', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "deepTools", + "deepTools Documentation", + author, + "deepTools", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. diff --git a/scripts/convertChromsBigWig.py b/scripts/convertChromsBigWig.py index c4db016c8..df2b6de38 100644 --- a/scripts/convertChromsBigWig.py +++ b/scripts/convertChromsBigWig.py @@ -10,92 +10,127 @@ def parse_arguments(defaults): - parser = argparse.ArgumentParser(description='Convert chromosome names for bigwig files between ensembl, gencode and UCSC naming schemes\n' + - "Per default it writes to the same location as original file, however with a modified filename:\n" + - "eg. test.bw --> test.[toFormat]_chroms.bw\n" + - "Change this with the -o option!\n\n" + - "Mapping tables are taken from https://github.com/dpryan79/ChromosomeMappings\n\n" + - "Provided mapping options need to exactly match an existing file\n" + - "[GENOME]_[FROM_FORMAT]2[TO_FORMAT].txt in this repo!", - usage='$ convertChroms BIGWIG', formatter_class=RawTextHelpFormatter) - - parser.add_argument('bw_in_filename', - metavar='BIGWIG', - help='bigwig file that will be converted') - - parser.add_argument('--genome', '-g', - action='store', - dest='genome', - help='Genome version of original bigwig \n' + - '(GRCm37|GRCm38|GRCh37|GRCh38|BDGP6|dm3|GRCz10|GRCz11|\n' + - 'JGI_4.2|MEDAKA1|R64-1-1|WBcel235|Zv9|galGal4|rn5|rn6)\n' + - '(default: %(default)s)', - default=defaults["genome"]) - - parser.add_argument('--fromFormat', '-f', - action='store', - dest='from_format', - help='Chr naming format of original bigwig (ensembl|gencode|UCSC) (default: %(default)s)', - default=defaults["fromFormat"]) - - parser.add_argument('--toFormat', '-t', - action='store', - dest='to_format', - help='Chr naming format of converted bigwig (ensembl|gencode|UCSC) (default: %(default)s)', - default=defaults["toFormat"]) - - parser.add_argument('--outFileName', '-o', - action='store', - dest='bw_out_filename', - help='Chr naming format of converted bigwig (ensembl|gencode|UCSC) (default: %(default)s)', - default=defaults["bw_out_filename"]) - - parser.add_argument('--baseURL', '-u', - action='store', - dest='base_url', - help='base url where the mapping tables can be found (default: %(default)s)\n' + - 'Local files can be given with \'file://[BASE_DIR]/\'', - default=defaults["base_url"]) - - parser.add_argument('--verbose', '-v', - action='store_true', - dest='verbose', - help='Be 
more verbose where possible (default: %(default)s)', - default=defaults["verbose"]) + parser = argparse.ArgumentParser( + description="Convert chromosome names for bigwig files between ensembl, gencode and UCSC naming schemes\n" + + "Per default it writes to the same location as original file, however with a modified filename:\n" + + "eg. test.bw --> test.[toFormat]_chroms.bw\n" + + "Change this with the -o option!\n\n" + + "Mapping tables are taken from https://github.com/dpryan79/ChromosomeMappings\n\n" + + "Provided mapping options need to exactly match an existing file\n" + + "[GENOME]_[FROM_FORMAT]2[TO_FORMAT].txt in this repo!", + usage="$ convertChroms BIGWIG", + formatter_class=RawTextHelpFormatter, + ) + + parser.add_argument( + "bw_in_filename", metavar="BIGWIG", help="bigwig file that will be converted" + ) + + parser.add_argument( + "--genome", + "-g", + action="store", + dest="genome", + help="Genome version of original bigwig \n" + + "(GRCm37|GRCm38|GRCh37|GRCh38|BDGP6|dm3|GRCz10|GRCz11|\n" + + "JGI_4.2|MEDAKA1|R64-1-1|WBcel235|Zv9|galGal4|rn5|rn6)\n" + + "(default: %(default)s)", + default=defaults["genome"], + ) + + parser.add_argument( + "--fromFormat", + "-f", + action="store", + dest="from_format", + help="Chr naming format of original bigwig (ensembl|gencode|UCSC) (default: %(default)s)", + default=defaults["fromFormat"], + ) + + parser.add_argument( + "--toFormat", + "-t", + action="store", + dest="to_format", + help="Chr naming format of converted bigwig (ensembl|gencode|UCSC) (default: %(default)s)", + default=defaults["toFormat"], + ) + + parser.add_argument( + "--outFileName", + "-o", + action="store", + dest="bw_out_filename", + help="Chr naming format of converted bigwig (ensembl|gencode|UCSC) (default: %(default)s)", + default=defaults["bw_out_filename"], + ) + + parser.add_argument( + "--baseURL", + "-u", + action="store", + dest="base_url", + help="base url where the mapping tables can be found (default: %(default)s)\n" + + "Local files can be given with 'file://[BASE_DIR]/'", + default=defaults["base_url"], + ) + + parser.add_argument( + "--verbose", + "-v", + action="store_true", + dest="verbose", + help="Be more verbose where possible (default: %(default)s)", + default=defaults["verbose"], + ) return parser -def get_chromosome_mapping(genome="GRCm38", from_format="ensembl", to_format="UCSC", verbose=True, base_url='https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/'): +def get_chromosome_mapping( + genome="GRCm38", + from_format="ensembl", + to_format="UCSC", + verbose=True, + base_url="https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/", +): """ creates a dict with chromosome name mappings according to provided conversion formats default base URL access a github repo with conversion files, but you can also give eg. 
a path to local directory """ - mapping_file = genome + '_' + from_format + '2' + to_format + '.txt' + mapping_file = genome + "_" + from_format + "2" + to_format + ".txt" - if re.match('^file:[/]+.*', base_url): + if re.match("^file:[/]+.*", base_url): base_url = re.sub("file:[/]*(/.*)", "\\1", base_url) if verbose: - print("load mapping table (" + mapping_file + ') from ' + base_url) + print("load mapping table (" + mapping_file + ") from " + base_url) tab = None - if re.match('^https?://.*', base_url): + if re.match("^https?://.*", base_url): try: - r = requests.get(base_url + '/' + mapping_file) + r = requests.get(base_url + "/" + mapping_file) r.raise_for_status() except requests.exceptions.RequestException as e: - print("\n", e, "\n\nPlease provide correct name (GENOME, FROM_FORMAT, TO_FORMAT) for a mapping table!\n") + print( + "\n", + e, + "\n\nPlease provide correct name (GENOME, FROM_FORMAT, TO_FORMAT) for a mapping table!\n", + ) sys.exit(1) tab = r.text - elif re.match('^[/]+.*', base_url): + elif re.match("^[/]+.*", base_url): try: - tab = open(base_url + '/' + mapping_file).read() + tab = open(base_url + "/" + mapping_file).read() except IOError as e: - print("\n", e, "\n\nPlease provide a correct name (GENOME, FROM_FORMAT, TO_FORMAT) for a mapping table!\n") + print( + "\n", + e, + "\n\nPlease provide a correct name (GENOME, FROM_FORMAT, TO_FORMAT) for a mapping table!\n", + ) sys.exit(1) else: print("\nPlease provide a correct BASE_URL for a mapping table!\n") @@ -106,7 +141,7 @@ def get_chromosome_mapping(genome="GRCm38", from_format="ensembl", to_format="UC if len(ent) == 0: continue pair = ent.split("\t") - if (len(pair[1]) <= 0): + if len(pair[1]) <= 0: # if (verbose): # print("skip chrom \'" + pair[0] + "\' - cannot be mapped to "+to_format) continue @@ -129,13 +164,17 @@ def convert_bigwig(mapping_table, bw_in_filename, bw_out_filename, verbose=False for c in curr_chroms: if c not in mapping_table: - if (verbose): - print("skip original chrom \'" + c + "\' - cannot be found in mapping table! Right GENOME & FROM_FORMAT?") + if verbose: + print( + "skip original chrom '" + + c + + "' - cannot be found in mapping table! Right GENOME & FROM_FORMAT?" + ) continue final_mapping_table[c] = mapping_table[c] new_chroms[mapping_table[c]] = curr_chroms[c] - if (len(new_chroms) <= 0): + if len(new_chroms) <= 0: print("No chromosomes found for mapping! 
Wrong 'FROM_FORMAT'?") sys.exit(1) @@ -147,34 +186,46 @@ def convert_bigwig(mapping_table, bw_in_filename, bw_out_filename, verbose=False c_map = final_mapping_table[c] if verbose: print("convert chromosome: ", c, " --> ", c_map) - bw_out.addEntries(list(itertools.repeat(c_map, len(c_int))), [x[0] for x in c_int], ends=[x[1] for x in c_int], values=[x[2] for x in c_int]) + bw_out.addEntries( + list(itertools.repeat(c_map, len(c_int))), + [x[0] for x in c_int], + ends=[x[1] for x in c_int], + values=[x[2] for x in c_int], + ) bw_out.close() bw.close() - if (verbose): + if verbose: print("\nbigwig conversion finished!\n") def main(args=None): - defaults = { - 'genome': 'GRCm38', - 'fromFormat': 'ensembl', - 'toFormat': 'UCSC', - 'verbose': False, - 'bw_out_filename': None, - 'base_url': 'https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/' + "genome": "GRCm38", + "fromFormat": "ensembl", + "toFormat": "UCSC", + "verbose": False, + "bw_out_filename": None, + "base_url": "https://raw.githubusercontent.com/dpryan79/ChromosomeMappings/master/", } args = parse_arguments(defaults).parse_args(args) bw_out_filename = args.bw_out_filename if args.bw_out_filename is None: - bw_out_filename = re.sub(r"(.[^\.]+)$", ".%s\\1" % (args.to_format + "_chroms"), args.bw_in_filename) + bw_out_filename = re.sub( + r"(.[^\.]+)$", ".%s\\1" % (args.to_format + "_chroms"), args.bw_in_filename + ) print("\noutput_file: " + bw_out_filename) - mapping_table = get_chromosome_mapping(genome=args.genome, from_format=args.from_format, to_format=args.to_format, verbose=args.verbose, base_url=args.base_url) + mapping_table = get_chromosome_mapping( + genome=args.genome, + from_format=args.from_format, + to_format=args.to_format, + verbose=args.verbose, + base_url=args.base_url, + ) convert_bigwig(mapping_table, args.bw_in_filename, bw_out_filename, args.verbose) diff --git a/scripts/split_bed_into_multiple_files.py b/scripts/split_bed_into_multiple_files.py index 2726a3518..1135efde1 100755 --- a/scripts/split_bed_into_multiple_files.py +++ b/scripts/split_bed_into_multiple_files.py @@ -21,10 +21,10 @@ tempArray = [] for line in sys.stdin: - if line[0] == '#': + if line[0] == "#": clusterName = line[1:].strip() tempArray.append("#" + clusterName + "\n") - open(clusterName + ".bed", 'w').write("".join(tempArray)) + open(clusterName + ".bed", "w").write("".join(tempArray)) tempArray = [] continue @@ -32,4 +32,4 @@ if len(tempArray) > 0: clusterName = "no_name" - open(clusterName + ".bed", 'w').write("".join(tempArray)) + open(clusterName + ".bed", "w").write("".join(tempArray)) diff --git a/setup.py b/setup.py index c926ac0b3..bb2b40afa 100755 --- a/setup.py +++ b/setup.py @@ -28,14 +28,12 @@ def get_version(): class sdist(_sdist): - def run(self): self.distribution.metadata.version = get_version() return _sdist.run(self) class install(_install): - def run(self): self.distribution.metadata.version = get_version() _install.run(self) @@ -60,27 +58,44 @@ def openREADME(): setup( - name='deepTools', + name="deepTools", version=get_version(), - author='Fidel Ramirez, Devon P Ryan, Björn Grüning, Friederike Dündar, Sarah Diehl,' - ' Vivek Bhardwaj, Fabian Kilpert, Andreas S Richter, Steffen Heyne, Thomas Manke', - author_email='dpryan79@gmail.com', + author="Fidel Ramirez, Devon P Ryan, Björn Grüning, Friederike Dündar, Sarah Diehl," + " Vivek Bhardwaj, Fabian Kilpert, Andreas S Richter, Steffen Heyne, Thomas Manke", + author_email="dpryan79@gmail.com", packages=find_packages(), - scripts=['bin/bamCompare', 
'bin/bamCoverage', 'bin/multiBamSummary', - 'bin/plotHeatmap', 'bin/plotFingerprint', 'bin/estimateScaleFactor', - 'bin/bamPEFragmentSize', 'bin/computeMatrix', 'bin/plotProfile', - 'bin/computeGCBias', 'bin/correctGCBias', 'bin/multiBigwigSummary', - 'bin/bigwigCompare', 'bin/plotCoverage', 'bin/plotPCA', 'bin/plotCorrelation', - 'bin/plotEnrichment', 'bin/deeptools', 'bin/computeMatrixOperations', - 'bin/estimateReadFiltering', 'bin/alignmentSieve'], + scripts=[ + "bin/bamCompare", + "bin/bamCoverage", + "bin/multiBamSummary", + "bin/plotHeatmap", + "bin/plotFingerprint", + "bin/estimateScaleFactor", + "bin/bamPEFragmentSize", + "bin/computeMatrix", + "bin/plotProfile", + "bin/computeGCBias", + "bin/correctGCBias", + "bin/multiBigwigSummary", + "bin/bigwigCompare", + "bin/plotCoverage", + "bin/plotPCA", + "bin/plotCorrelation", + "bin/plotEnrichment", + "bin/deeptools", + "bin/computeMatrixOperations", + "bin/estimateReadFiltering", + "bin/alignmentSieve", + ], include_package_data=True, - url='http://pypi.python.org/pypi/deepTools/', - license='LICENSE.txt', - description='Useful tools for exploring deep sequencing data ', + url="http://pypi.python.org/pypi/deepTools/", + license="LICENSE.txt", + description="Useful tools for exploring deep sequencing data ", long_description=openREADME(), classifiers=[ - 'Intended Audience :: Science/Research', - 'Topic :: Scientific/Engineering :: Bio-Informatics'], + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Bio-Informatics", + ], install_requires=[ "numpy >= 1.9.0", "scipy >= 0.17.0", @@ -90,8 +105,8 @@ def openREADME(): "pyBigWig >= 0.2.1", "py2bit >= 0.2.0", "plotly >= 4.9", - "deeptoolsintervals >= 0.1.8" + "deeptoolsintervals >= 0.1.8", ], zip_safe=True, - cmdclass={'sdist': sdist, 'install': install} + cmdclass={"sdist": sdist, "install": install}, )
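
Note: every hunk in this patch is presentation-only (quote normalisation, trailing commas, Black-style call wrapping, blank-line trimming), so a reviewer can machine-check that behaviour is unchanged by comparing the parsed AST of each file before and after. A hedged sketch, assuming the pre-format source is kept under a ".orig" suffix (a placeholder, not a file in this repository); one caveat is that docstring re-indentation, as in tbitToBamChrName above, does change the string constant and therefore the AST, so such files need their docstrings normalised or reviewed by eye:

    # Hedged sketch: a formatting-only change should parse to an identical
    # AST. Quote style, call wrapping, trailing commas and implicit string
    # re-splits all leave the tree unchanged; docstring whitespace does not.
    import ast

    def same_ast(old_path, new_path):
        """True when two sources parse to identical ASTs."""
        with open(old_path) as old, open(new_path) as new:
            return ast.dump(ast.parse(old.read())) == ast.dump(ast.parse(new.read()))

    # e.g. same_ast("deeptools/utilities.py.orig", "deeptools/utilities.py")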