diff --git a/Cargo.toml b/Cargo.toml index fdff2a94..949f9056 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,4 +14,5 @@ rust-htslib = "0.47.0" rayon = "1.10.0" itertools = "0.12.1" bigtools = "0.5.3" -tokio = "*" \ No newline at end of file +tokio = "*" +flate2 = "*" \ No newline at end of file diff --git a/pydeeptools/deeptools/computeMatrix2.py b/pydeeptools/deeptools/computeMatrix2.py new file mode 100644 index 00000000..c5c35dbb --- /dev/null +++ b/pydeeptools/deeptools/computeMatrix2.py @@ -0,0 +1,416 @@ + +import argparse +import sys +from deeptools.parserCommon import writableFile, numberOfProcessors +from deeptools import parserCommon +from importlib.metadata import version +from deeptools.hp import r_computematrix + +def parse_arguments(args=None): + parser = \ + argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=""" + +This tool calculates scores per genome regions and prepares an intermediate file that can be used with ``plotHeatmap`` and ``plotProfiles``. +Typically, the genome regions are genes, but any other regions defined in a BED file can be used. +computeMatrix accepts multiple score files (bigWig format) and multiple regions files (BED format). +This tool can also be used to filter and sort regions according +to their score. + +To learn more about the specific parameters, type: + +$ computeMatrix reference-point --help or + +$ computeMatrix scale-regions --help + +""", + epilog='An example usage is:\n computeMatrix reference-point -S ' + ' -R -b 1000\n \n') + + parser.add_argument('--version', action='version', + version='%(prog)s {}'.format(version('deeptools'))) + + subparsers = parser.add_subparsers( + title='Commands', + dest='command', + metavar='') + + # scale-regions mode options + subparsers.add_parser( + 'scale-regions', + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + parents=[ + computeMatrixRequiredArgs(), + computeMatrixOutputArgs(), + computeMatrixOptArgs(case='scale-regions'), + parserCommon.gtf_options() + ], + help="In the scale-regions mode, all regions in the BED file are " + "stretched or shrunken to the length (in bases) indicated by the user.", + usage='An example usage is:\n computeMatrix scale-regions -S ' + ' -R -b 1000\n\n') + + # reference point arguments + subparsers.add_parser( + 'reference-point', + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + parents=[computeMatrixRequiredArgs(), + computeMatrixOutputArgs(), + computeMatrixOptArgs(case='reference-point'), + parserCommon.gtf_options() + ], + help="Reference-point refers to a position within a BED region " + "(e.g., the starting point). In this mode, only those genomic" + "positions before (upstream) and/or after (downstream) of the " + "reference point will be plotted.", + usage='An example usage is:\n computeMatrix reference-point -S ' + ' -R -a 3000 -b 3000\n\n') + + return parser + + +def computeMatrixRequiredArgs(args=None): + parser = argparse.ArgumentParser(add_help=False) + required = parser.add_argument_group('Required arguments') + required.add_argument('--regionsFileName', '-R', + metavar='File', + help='File name or names, in BED or GTF format, containing ' + 'the regions to plot. If multiple bed files are given, each one is considered a ' + 'group that can be plotted separately. Also, adding a "#" symbol in the bed file ' + 'causes all the regions until the previous "#" to be considered one group.', + nargs='+', + required=True) + required.add_argument('--scoreFileName', '-S', + help='bigWig file(s) containing ' + 'the scores to be plotted. Multiple files should be separated by spaced. BigWig ' + 'files can be obtained by using the bamCoverage ' + 'or bamCompare tools. More information about ' + 'the bigWig file format can be found at ' + 'http://genome.ucsc.edu/goldenPath/help/bigWig.html ', + metavar='File', + nargs='+', + required=True) + return parser + + +def computeMatrixOutputArgs(args=None): + parser = argparse.ArgumentParser(add_help=False) + output = parser.add_argument_group('Output options') + output.add_argument('--outFileName', '-out', '-o', + help='File name to save the gzipped matrix file ' + 'needed by the "plotHeatmap" and "plotProfile" tools.', + type=writableFile, + required=True) + + output.add_argument('--outFileNameMatrix', + help='If this option is given, then the matrix ' + 'of values underlying the heatmap will be saved ' + 'using the indicated name, e.g. IndividualValues.tab.' + 'This matrix can easily be loaded into R or ' + 'other programs.', + metavar='FILE', + type=writableFile) + output.add_argument('--outFileSortedRegions', + help='File name in which the regions are saved ' + 'after skiping zeros or min/max threshold values. The ' + 'order of the regions in the file follows the sorting ' + 'order selected. This is useful, for example, to ' + 'generate other heatmaps keeping the sorting of the ' + 'first heatmap. Example: Heatmap1sortedRegions.bed', + metavar='BED file', + type=argparse.FileType('w')) + return parser + + +def computeMatrixOptArgs(case=['scale-regions', 'reference-point'][0]): + + parser = argparse.ArgumentParser(add_help=False) + optional = parser.add_argument_group('Optional arguments') + optional.add_argument('--version', action='version', + version='%(prog)s {}'.format(version('deeptools'))) + + if case == 'scale-regions': + optional.add_argument('--regionBodyLength', '-m', + default=1000, + type=int, + help='Distance in bases to which all regions will ' + 'be fit. (Default: %(default)s)') + optional.add_argument('--startLabel', + default='TSS', + help='Label shown in the plot for the start of ' + 'the region. Default is TSS (transcription ' + 'start site), but could be changed to anything, ' + 'e.g. "peak start". Note that this is only ' + 'useful if you plan to plot the results yourself ' + 'and not, for example, with plotHeatmap, which ' + 'will override this. (Default: %(default)s)') + optional.add_argument('--endLabel', + default='TES', + help='Label shown in the plot for the region ' + 'end. Default is TES (transcription end site). ' + 'See the --startLabel option for more ' + 'information. (Default: %(default)s) ') + optional.add_argument('--beforeRegionStartLength', '-b', '--upstream', + default=0, + type=int, + help='Distance upstream of the start site of ' + 'the regions defined in the region file. If the ' + 'regions are genes, this would be the distance ' + 'upstream of the transcription start site. (Default: %(default)s)') + optional.add_argument('--afterRegionStartLength', '-a', '--downstream', + default=0, + type=int, + help='Distance downstream of the end site ' + 'of the given regions. If the ' + 'regions are genes, this would be the distance ' + 'downstream of the transcription end site. (Default: %(default)s)') + optional.add_argument("--unscaled5prime", + default=0, + type=int, + help='Number of bases at the 5-prime end of the ' + 'region to exclude from scaling. By default, ' + 'each region is scaled to a given length (see the --regionBodyLength option). In some cases it is useful to look at unscaled signals around region boundaries, so this setting specifies the number of unscaled bases on the 5-prime end of each boundary. (Default: %(default)s)') + optional.add_argument("--unscaled3prime", + default=0, + type=int, + help='Like --unscaled5prime, but for the 3-prime ' + 'end. (Default: %(default)s)') + + elif case == 'reference-point': + optional.add_argument('--referencePoint', + default='TSS', + choices=['TSS', 'TES', 'center'], + help='The reference point for the plotting ' + 'could be either the region start (TSS), the ' + 'region end (TES) or the center of the region. ' + 'Note that regardless of what you specify, ' + 'plotHeatmap/plotProfile will default to using "TSS" as the ' + 'label. (Default: %(default)s)') + + # set region body length to zero for reference point mode + optional.add_argument('--regionBodyLength', help=argparse.SUPPRESS, + default=0, type=int) + optional.add_argument('--unscaled5prime', default=0, type=int, help=argparse.SUPPRESS) + optional.add_argument('--unscaled3prime', default=0, type=int, help=argparse.SUPPRESS) + optional.add_argument('--beforeRegionStartLength', '-b', '--upstream', + default=500, + type=int, + metavar='INT bp', + help='Distance upstream of the reference-point ' + 'selected. (Default: %(default)s)') + optional.add_argument('--afterRegionStartLength', '-a', '--downstream', + default=1500, + metavar='INT bp', + type=int, + help='Distance downstream of the ' + 'reference-point selected. (Default: %(default)s)') + optional.add_argument('--nanAfterEnd', + action='store_true', + help='If set, any values after the region end ' + 'are discarded. This is useful to visualize ' + 'the region end when not using the ' + 'scale-regions mode and when the reference-' + 'point is set to the TSS.') + + optional.add_argument('--binSize', '-bs', + help='Length, in bases, of the non-overlapping ' + 'bins for averaging the score over the ' + 'regions length. (Default: %(default)s)', + type=int, + default=10) + + optional.add_argument('--sortRegions', + help='Whether the output file should present the ' + 'regions sorted. The default is to not sort the regions. ' + 'Note that this is only useful if you plan to plot ' + 'the results yourself and not, for example, with ' + 'plotHeatmap, which will override this. Note also that ' + 'unsorted output will be in whatever order the regions ' + 'happen to be processed in and not match the order in ' + 'the input files. If you require the output order to ' + 'match that of the input regions, then either specify ' + '"keep" or use computeMatrixOperations to resort the ' + 'results file. (Default: %(default)s)', + choices=["descend", "ascend", "no", "keep"], + default='keep') + + optional.add_argument('--sortUsing', + help='Indicate which method should be used for ' + 'sorting. The value is computed for each row.' + 'Note that the region_length option will lead ' + 'to a dotted line within the heatmap that indicates ' + 'the end of the regions. (Default: %(default)s)', + choices=["mean", "median", "max", "min", "sum", + "region_length"], + default='mean') + + optional.add_argument('--sortUsingSamples', + help='List of sample numbers (order as in matrix), ' + 'that are used for sorting by --sortUsing, ' + 'no value uses all samples, ' + 'example: --sortUsingSamples 1 3', + type=int, nargs='+') + + optional.add_argument('--averageTypeBins', + default='mean', + choices=["mean", "median", "min", + "max", "std", "sum"], + help='Define the type of statistic that should be ' + 'used over the bin size range. The ' + 'options are: "mean", "median", "min", "max", "sum" ' + 'and "std". The default is "mean". (Default: %(default)s)') + + optional.add_argument('--missingDataAsZero', + help='If set, missing data (NAs) will be treated as zeros. ' + 'The default is to ignore such cases, which will be depicted as black areas in ' + 'a heatmap. (see the --missingDataColor argument ' + 'of the plotHeatmap command for additional options).', + action='store_true') + + optional.add_argument('--skipZeros', + help='Whether regions with only scores of zero ' + 'should be included or not. Default is to include ' + 'them.', + action='store_true') + + optional.add_argument('--minThreshold', + default=None, + type=float, + help='Numeric value. Any region containing a ' + 'value that is less than or equal to this ' + 'will be skipped. This is useful to skip, ' + 'for example, genes where the read count is zero ' + 'for any of the bins. This could be the result of ' + 'unmappable areas and can bias the overall results. (Default: %(default)s)') + + optional.add_argument('--maxThreshold', + default=None, + type=float, + help='Numeric value. Any region containing a value ' + 'greater than or equal to this ' + 'will be skipped. The maxThreshold is useful to ' + 'skip those few regions with very high read counts ' + '(e.g. micro satellites) that may bias the average ' + 'values. (Default: %(default)s)') + + optional.add_argument('--blackListFileName', '-bl', + help="A BED file containing regions that should be excluded from all analyses. Currently this works by rejecting genomic chunks that happen to overlap an entry. Consequently, for BAM files, if a read partially overlaps a blacklisted region or a fragment spans over it, then the read/fragment might still be considered.", + metavar="BED file", + required=False) + + optional.add_argument('--samplesLabel', + help='Labels for the samples. This will then be passed to plotHeatmap and plotProfile. The ' + 'default is to use the file name of the ' + 'sample. The sample labels should be separated ' + 'by spaces and quoted if a label itself' + 'contains a space E.g. --samplesLabel label-1 "label 2" ', + nargs='+') + + optional.add_argument('--smartLabels', + action='store_true', + help='Instead of manually specifying labels for the input ' + 'bigWig and BED/GTF files, this causes deepTools to use the file name ' + 'after removing the path and extension.') + + # in contrast to other tools, + # computeMatrix by default outputs + # messages and the --quiet flag supresses them + optional.add_argument('--quiet', '-q', + help='Set to remove any warning or processing ' + 'messages.', + action='store_true') + + optional.add_argument('--verbose', + help='Being VERY verbose in the status messages. --quiet will disable this.', + action='store_true') + + optional.add_argument('--scale', + help='If set, all values are multiplied by ' + 'this number. (Default: %(default)s)', + type=float, + default=1) + optional.add_argument('--numberOfProcessors', '-p', + help='Number of processors to use. Type "max/2" to ' + 'use half the maximum number of processors or "max" ' + 'to use all available processors. (Default: %(default)s)', + metavar="INT", + type=numberOfProcessors, + default=1, + required=False) + return parser + + +def process_args(args=None): + args = parse_arguments().parse_args(args) + + if len(sys.argv) == 1: + parse_arguments().print_help() + sys.exit() + + if args.quiet is True: + args.verbose = False + + # Ensure before and after region length is positive + if args.beforeRegionStartLength < 0: + print(f"beforeRegionStartLength changed from {args.beforeRegionStartLength} into {abs(args.beforeRegionStartLength)}") + args.beforeRegionStartLength = abs(args.beforeRegionStartLength) + if args.afterRegionStartLength < 0: + print(f"afterRegionStartLength changed from {args.afterRegionStartLength} into {abs(args.afterRegionStartLength)}") + args.afterRegionStartLength = abs(args.afterRegionStartLength) + + if args.command == 'scale-regions': + args.nanAfterEnd = False + args.referencePoint = None + elif args.command == 'reference-point': + if args.beforeRegionStartLength == 0 and \ + args.afterRegionStartLength == 0: + sys.exit("\nUpstrean and downstream regions are both " + "set to 0. Nothing to output. Maybe you want to " + "use the scale-regions mode?\n") + + return args + + +def main(args=None): + + args = process_args(args) + + parameters = {'upstream': args.beforeRegionStartLength, + 'downstream': args.afterRegionStartLength, + 'body': args.regionBodyLength, + 'bin size': args.binSize, + 'ref point': args.referencePoint, + 'verbose': args.verbose, + 'bin avg type': args.averageTypeBins, + 'missing data as zero': args.missingDataAsZero, + 'min threshold': args.minThreshold, + 'max threshold': args.maxThreshold, + 'scale': args.scale, + 'skip zeros': args.skipZeros, + 'nan after end': args.nanAfterEnd, + 'proc number': args.numberOfProcessors, + 'sort regions': args.sortRegions, + 'sort using': args.sortUsing, + 'unscaled 5 prime': args.unscaled5prime, + 'unscaled 3 prime': args.unscaled3prime + } + # Assert all regions and scores exist + print(parameters) + print(args) + r_computematrix( + args.command, + args.regionsFileName, + args.scoreFileName, + args.beforeRegionStartLength, + args.afterRegionStartLength, + args.unscaled5prime, + args.unscaled3prime, + args.regionBodyLength, + args.binSize, + args.missingDataAsZero, + args.referencePoint, + args.numberOfProcessors, + args.verbose, + args.outFileName + ) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 414e8fc9..bf1d7459 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,4 +84,5 @@ plotHeatmap = "deeptools.plotHeatmap:main" plotPCA = "deeptools.plotPCA:main" plotProfile = "deeptools.plotProfile:main" bamCoverage2 = "deeptools.bamCoverage2:main" -bamCompare2 = "deeptools.bamCompare2:main" \ No newline at end of file +bamCompare2 = "deeptools.bamCompare2:main" +computeMatrix2 = "deeptools.computeMatrix2:main" \ No newline at end of file diff --git a/src/computematrix.rs b/src/computematrix.rs new file mode 100644 index 00000000..b25efce4 --- /dev/null +++ b/src/computematrix.rs @@ -0,0 +1,165 @@ +use pyo3::prelude::*; +use pyo3::types::PyList; +use crate::filehandler::{read_bedfiles, chrombounds_from_bw, bwintervals, header_matrix, write_matrix}; +use rayon::prelude::*; +use rayon::ThreadPoolBuilder; +use std::collections::HashMap; +use std::path::Path; + +#[pyfunction] +pub fn r_computematrix( + mode: &str, + bedlis: Py, + bwlis: Py, + upstream: u64, + downstream: u64, + unscaled5prime: u64, + unscaled3prime: u64, + regionbodylength: u64, + binsize: u64, + missingdatazero: bool, + referencepoint: &str, + nproc: usize, + verbose: bool, + ofile: &str +) -> PyResult<()> { + + // Extract the bed and bigwig files from pyList to Vec. + let mut bed_files: Vec = Vec::new(); + let mut bw_files: Vec = Vec::new(); + Python::with_gil(|py| { + bed_files = bedlis.extract(py).expect("Failed to retrieve bed file strings."); + bw_files = bwlis.extract(py).expect("Failed to retrieve bed file strings."); + }); + // Get chromosome boundaries from first bigwig file. + let chromsizes = chrombounds_from_bw(&bw_files.get(0).unwrap()); + // compute number of columns + let bpsum = &upstream + &downstream + &unscaled5prime + &unscaled3prime + ®ionbodylength; + // Get the 'basepaths' of the bed files to use as labels later on. + let mut bedlabels: Vec = Vec::new(); + for bed in bed_files.iter() { + let entryname = Path::new(bed) + .file_stem() + .unwrap() + .to_string_lossy() + .into_owned(); + bedlabels.push(entryname); + } + // Define the scaling regions in a struct + let scale_regions = Scalingregions { + upstream: upstream, + downstream: downstream, + unscaled5prime: unscaled5prime, + unscaled3prime: unscaled3prime, + regionbodylength: regionbodylength, + binsize: binsize, + cols_expected: ((bw_files.len() as u64 * bpsum) / binsize) as usize, + missingdata_as_zero: missingdatazero, + referencepoint: referencepoint.to_string(), + mode: mode.to_string(), + bwfiles: bw_files.len(), + avgtype: "mean".to_string(), + verbose: verbose, + proc_number: nproc, + bedlabels: bedlabels + }; + // Parse regions from bed files. Note that we retain the name of the bed file (in case there are more then 1) + // Additionaly, score and strand are also retained, if it's a 3-column bed file we just fill in '.' + let (regions, regionsizes) = read_bedfiles(&bed_files); + let slopregions = slop_regions( + ®ions, + &scale_regions, + &chromsizes + ); + let pool = ThreadPoolBuilder::new().num_threads(nproc).build().unwrap(); + let matrix: Vec> = pool.install(|| { + bw_files.par_iter() + .map(|i| bwintervals(&i, ®ions, &slopregions, &scale_regions)) + .reduce( + || vec![vec![]; regions.len()], + |mut acc, vec_of_vecs| { + for (i, inner_vec) in vec_of_vecs.into_iter().enumerate() { + acc[i].extend(inner_vec); + } + acc + }, + ) + }); + write_matrix( + header_matrix(&scale_regions, regionsizes), + matrix, + ofile, + regions + ); + Ok(()) +} + +fn slop_regions( + regions: &Vec<(String, u64, u64, String, String, String)>, + scale_regions: &Scalingregions, + chromsizes: &HashMap +) -> Vec> { + let mut regionranges: Vec> = Vec::new(); + // Idea is to create a vector of tuples with start and end of every bin (binsize passed by computeMatrix). + // The number of columns per region needs to be fixed per region. + // Note that the before / after could mean that we run out of chromosome. + // Invalid regions (later to be encoded as NA or 0), will be pushed as (0,0) tuples. + let col_expected = scale_regions.cols_expected / scale_regions.bwfiles; + for region in regions.iter() { + // To implement: + // // scale-regions + unscaled 5 and 3 + // // + and - encodings + let chromend: u64 = *chromsizes.get(®ion.0).unwrap(); + assert!(region.2 <= chromend, "Region end goes beyond chromosome boundary. Fix your bed files. {:?} > {}", region, chromend); + assert!(region.1 <= chromend, "Region start goes beyond chromosome boundary. Fix your bed files. {:?} > {}", region, chromend); + + let anchorpoint = match scale_regions.referencepoint.as_str() { + "TSS" => region.1, + "TES" => region.2, + "center" => (region.1 + region.2) / 2, + _ => panic!("Reference should either be TSS, TES or center. {:?} is not supported.", scale_regions.referencepoint), + }; + + let mut regionsizes: Vec<(u64, u64)> = Vec::new(); + let mut absstart: i64 = anchorpoint as i64 - scale_regions.upstream as i64; + let absstop: i64 = anchorpoint as i64 + scale_regions.downstream as i64; + while absstart < absstop { + let bin = absstart + scale_regions.binsize as i64; + if absstart < 0 || bin > chromend as i64 { + regionsizes.push((0,0)); + } else { + regionsizes.push((absstart as u64, bin as u64)) + } + absstart = bin; + } + assert!( + regionsizes.len() == col_expected, + "Number of bins does not match expected number of columns: (CHROMLEN = {}) {:?} \n \n {:?}, \n \n {} != {}", + *chromsizes.get(®ion.0).unwrap(), + region, + regionsizes, + regionsizes.len(), + col_expected, + ); + regionranges.push(regionsizes); + } + return regionranges; +} + +pub struct Scalingregions { + pub upstream: u64, + pub downstream: u64, + pub unscaled5prime: u64, + pub unscaled3prime: u64, + pub regionbodylength: u64, + pub binsize: u64, + pub cols_expected: usize, + pub missingdata_as_zero: bool, + pub referencepoint: String, + pub mode: String, + pub bwfiles: usize, + pub avgtype: String, + pub verbose: bool, + pub proc_number: usize, + pub bedlabels: Vec +} \ No newline at end of file diff --git a/src/filehandler.rs b/src/filehandler.rs index ace4176a..f7142846 100644 --- a/src/filehandler.rs +++ b/src/filehandler.rs @@ -1,9 +1,15 @@ use rust_htslib::bam::{Read, Reader}; -use std::io::{BufWriter, Write}; +use itertools::Itertools; +use std::io::{BufReader, BufWriter, Write}; +use std::io::prelude::*; use std::fs::File; -use bigtools::{BigWigWrite, Value}; +use std::path::Path; +use bigtools::{BigWigRead, BigWigWrite, Value}; use bigtools::beddata::BedParserStreamingIterator; use std::collections::HashMap; +use crate::computematrix::Scalingregions; +use flate2::write::GzEncoder; +use flate2::Compression; pub fn bam_ispaired(bam_ifile: &str) -> bool { let mut bam = Reader::from_path(bam_ifile).unwrap(); @@ -41,4 +47,227 @@ pub fn write_file(ofile: &str, filetype: &str, bg: Vec<(String, u64, u64, f64)>, println!("Start writer"); let _ = writer.write(vals, runtime); } +} + +pub fn read_bedfiles(bed_files: &Vec) -> (Vec<(String, u64, u64, String, String, String)>, HashMap) { + // read all bedfiles in a Vec of strings (filepaths) + // returns a vec of tuples, with last entry being the filepath (without extension and dirs ~= 'smartLabels') + let mut regionsizes: HashMap = HashMap::new(); + let mut regions: Vec<(String, u64, u64, String, String, String)> = Vec::new(); + for bed in bed_files { + let entryname = Path::new(bed) + .file_stem() + .unwrap() + .to_string_lossy() + .into_owned(); + + let bedfile = BufReader::new(File::open(bed).unwrap()); + let mut entries: u64 = 0; + for line in bedfile.lines() { + let line = line.unwrap(); + let fields: Vec<&str> = line.split('\t').collect(); + // If score and strand are not provided, we set them to . + let field_3 = fields.get(3).unwrap_or(&".").to_string(); + let field_4 = fields.get(4).unwrap_or(&".").to_string(); + regions.push( + ( + fields[0].to_string(), //chrom + fields[1].parse().unwrap(), //start + fields[2].parse().unwrap(), //end + field_3, //score + field_4, //strand + entryname.to_string() //bedfile_name + ) + ); + entries += 1; + } + regionsizes.insert(entryname, entries); + } + return (regions, regionsizes); +} + +pub fn chrombounds_from_bw(bwfile: &str) -> HashMap { + // define chromsizes hashmap + let mut chromsizes: HashMap = HashMap::new(); + let bwf = File::open(bwfile).expect("Failed to open bw file."); + let mut reader = BigWigRead::open(bwf).unwrap(); + for chrom in reader.chroms() { + chromsizes.insert(chrom.name.clone(), chrom.length as u64); + } + chromsizes +} + +pub fn bwintervals( + bwfile: &str, + regions: &Vec<(String, u64, u64, String, String, String)>, + slopregions: &Vec>, + scale_regions: &Scalingregions +) -> Vec> { + // For a given bw file, a vector of slop regions (where every vec entry is a vec of (start, end) tuples) + // return a vector with for every region a vector of f64. + + // Make sure regions and slopregions are of equal length + assert_eq!(regions.len(), slopregions.len()); + + // Define return vector, set up bw reader. + let mut bwvals: Vec> = Vec::new(); + let bwf = File::open(bwfile).expect("Failed to open bw file."); + let mut reader = BigWigRead::open(bwf).unwrap(); + + // Iterate over regions and slopregions synchronously. + for (sls, (chrom, _, _, _, _, _)) in slopregions.iter().zip(regions.iter()) { + let mut bwval: Vec = Vec::new(); + // get 'min and max' to query + let (min, max) = sls + .iter() + .flat_map(|(x, y)| vec![*x, *y]) + .filter(|&value| value != 0) + .minmax() + .into_option() + .expect("Vec is empty."); + let binvals = reader.get_interval(chrom, min as u32, max as u32).unwrap(); + // since binvals (can) be over binsizes, we expand them to bp and push them to a hashmap + let mut bwhash: HashMap = HashMap::new(); + for interval in binvals { + let interval = interval.unwrap(); + let start = interval.start as u64; + let end = interval.end as u64; + let val = interval.value as f64; + bwhash.extend((start..end).map(|bp| (bp, val))); + } + for (start, end) in sls { + if start == end && *end == 0 { + if scale_regions.missingdata_as_zero { + bwval.push(0.0); + } else { + bwval.push(std::f64::NAN); + } + } else { + // Get values from the hashmap + let mean: f64 = (*start..*end) + .filter_map(|bp| bwhash.get(&bp)) + .copied() + .sum::() + / (end - start) as f64; + bwval.push(mean); + } + } + // Make sure bwval is of expected length. + assert_eq!(bwval.len(), scale_regions.cols_expected / scale_regions.bwfiles); + bwvals.push(bwval); + } + bwvals +} + +pub fn header_matrix(scale_regions: &Scalingregions, regionsizes: HashMap) -> String { + // Create the header for the matrix. + // This is quite ugly, but this is mainly because we need to accomodate delta bwfiles. + let mut headstr = String::new(); + headstr.push_str("@{"); + headstr.push_str( + &format!("\"upstream\":[{}],", (0..scale_regions.bwfiles).map(|_| scale_regions.upstream).collect::>().into_iter().join(",")) + ); + headstr.push_str( + &format!("\"downstream\":[{}],", (0..scale_regions.bwfiles).map(|_| scale_regions.downstream).collect::>().into_iter().join(",")) + ); + headstr.push_str( + &format!("\"body\":[{}],", (0..scale_regions.bwfiles).map(|_| scale_regions.regionbodylength).collect::>().into_iter().join(",")) + ); + headstr.push_str( + &format!("\"bin size\":[{}],", (0..scale_regions.bwfiles).map(|_| scale_regions.binsize).collect::>().into_iter().join(",")) + ); + headstr.push_str( + &format!("\"ref point\":[\"{}\"],", (0..scale_regions.bwfiles).map(|_| scale_regions.referencepoint.clone()).collect::>().into_iter().join("\",\"")) + ); + headstr.push_str( + &format!("\"verbose\":{},", scale_regions.verbose) + ); + headstr.push_str( + &format!("\"bin avg type\":\"{}\",", scale_regions.avgtype) + ); + headstr.push_str( + &format!("\"missing data as zero\":{},", scale_regions.missingdata_as_zero) + ); + // Unimplemented arguments, but they need to be present in header for now anyway. + headstr.push_str( + "\"min threshold\":null,\"max threshold\":null,\"scale\":1,\"skip zeros\":false,\"nan after end\":false," + ); + headstr.push_str( + &format!("\"proc number\":{},", scale_regions.proc_number) + ); + // Unimplemented for now... + headstr.push_str( + "\"sort regions\":\"keep\",\"sort using\":\"mean\"," + ); + headstr.push_str( + &format!("\"unscaled 5 prime\":[{}],", (0..scale_regions.bwfiles).map(|_| scale_regions.unscaled5prime).collect::>().into_iter().join(",")) + ); + headstr.push_str( + &format!("\"unscaled 3 prime\":[{}],", (0..scale_regions.bwfiles).map(|_| scale_regions.unscaled3prime).collect::>().into_iter().join(",")) + ); + headstr.push_str( + &format!("\"group_labels\":[\"{}\"],", scale_regions.bedlabels.join("\",\"")) + ); + // Get cumulative sizes of regions + let mut groupbounds: Vec = Vec::new(); + groupbounds.push(0); + let mut cumsum: u64 = 0; + for bedlabel in scale_regions.bedlabels.iter() { + cumsum += regionsizes.get(bedlabel).unwrap(); + groupbounds.push(cumsum); + } + let groupbounds = format!("{}", groupbounds.iter() + .map(|&x| x.to_string()) + .collect::>() + .join(",")); + + headstr.push_str( + &format!("\"group_boundaries\":[{}],", groupbounds) + ); + // Get cumulative sizes for sample boundaries + let colsize_per_sample = scale_regions.cols_expected / scale_regions.bwfiles; + let mut samplebounds: Vec = Vec::new(); + samplebounds.push(0); + let mut cumsum: u64 = 0; + for i in 0..scale_regions.bwfiles { + cumsum += colsize_per_sample as u64; + samplebounds.push(cumsum); + } + let samplebounds = format!("{}", samplebounds.iter() + .map(|&x| x.to_string()) + .collect::>() + .join(",")); + + headstr.push_str( + &format!("\"sample_boundaries\":[{}]", samplebounds) + ); + headstr.push_str( + "}\n" + ); + headstr +} + +pub fn write_matrix(header: String, mat: Vec>, ofile: &str, regions: Vec<(String, u64, u64, String, String, String)>) { + println!("Writing out matrix to file."); + // Write out the matrix to a compressed file. + let omat = File::create(ofile).unwrap(); + let mut encoder = GzEncoder::new(omat, Compression::default()); + encoder.write_all(header.as_bytes()).unwrap(); + // Final check to make sure our regions and mat iter are of equal length. + assert_eq!(regions.len(), mat.len()); + for (region, row) in regions.into_iter().zip(mat.into_iter()) { + let mut writerow = format!( + "{}\t{}\t{}\t{}\t{}\t", + region.0, // String field + region.1.to_string(), // u64 field 1 converted to string + region.2.to_string(), // u64 field 2 converted to string + region.3, // String field + region.4, // String field + ); + writerow.push_str( + &row.iter().map(|x| x.to_string()).collect::>().join("\t") + ); + writerow.push_str("\n"); + encoder.write_all(writerow.as_bytes()).unwrap(); + } } \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 386f4312..a323de10 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,7 @@ use pyo3::prelude::*; mod bamcoverage; mod bamcompare; +mod computematrix; mod covcalc; mod alignmentsieve; mod filehandler; @@ -11,5 +12,6 @@ mod calc; fn hp(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(bamcoverage::r_bamcoverage, m)?)?; m.add_function(wrap_pyfunction!(bamcompare::r_bamcompare, m)?)?; + m.add_function(wrap_pyfunction!(computematrix::r_computematrix, m)?)?; Ok(()) }