Skip to content

Commit

Permalink
code format for passing pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
Xu, Beisi authored and Xu, Beisi committed Jul 12, 2023
1 parent a7059c1 commit fb92d3c
Show file tree
Hide file tree
Showing 55 changed files with 9,543 additions and 5,934 deletions.
103 changes: 63 additions & 40 deletions deeptools/SES_scaleFactor.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,22 @@
from deeptools import bamHandler
import deeptools.countReadsPerBin as countR

old_settings = np.seterr(all='ignore')
old_settings = np.seterr(all="ignore")
debug = 0


def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
normalizationLength,
avg_method='median', blackListFileName=None, numberOfProcessors=1,
verbose=False, chrsToSkip=[], mappingStatsList=[]):
def estimateScaleFactor(
bamFilesList,
binLength,
numberOfSamples,
normalizationLength,
avg_method="median",
blackListFileName=None,
numberOfProcessors=1,
verbose=False,
chrsToSkip=[],
mappingStatsList=[],
):
r"""
Subdivides the genome into chunks to be analyzed in parallel
using several processors. The code handles the creation of
Expand Down Expand Up @@ -83,20 +91,28 @@ def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
else:
mappedReads = []
for fname in bamFilesList:
mappedReads.append(bamHandler.openBam(fname, returnStats=True, nThreads=numberOfProcessors)[1])

sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype='float64')

sizeFactorBasedOnMappedReads = sizeFactorBasedOnMappedReads.min() / sizeFactorBasedOnMappedReads

cr = countR.CountReadsPerBin(bamFilesList,
binLength=binLength,
numberOfSamples=numberOfSamples,
extendReads=False,
blackListFileName=blackListFileName,
numberOfProcessors=numberOfProcessors,
verbose=verbose,
chrsToSkip=chrsToSkip)
mappedReads.append(
bamHandler.openBam(
fname, returnStats=True, nThreads=numberOfProcessors
)[1]
)

sizeFactorBasedOnMappedReads = np.array(mappedReads, dtype="float64")

sizeFactorBasedOnMappedReads = (
sizeFactorBasedOnMappedReads.min() / sizeFactorBasedOnMappedReads
)

cr = countR.CountReadsPerBin(
bamFilesList,
binLength=binLength,
numberOfSamples=numberOfSamples,
extendReads=False,
blackListFileName=blackListFileName,
numberOfProcessors=numberOfProcessors,
verbose=verbose,
chrsToSkip=chrsToSkip,
)

try:
num_reads_per_bin = cr.run()
Expand Down Expand Up @@ -127,7 +143,7 @@ def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
# Take a lower rank to move to a region with probably
# less peaks and more background.
maxIndex = int(maxIndex * 0.8)
while(maxIndex < len(p)):
while maxIndex < len(p):
# in rare cases the maxIndex maps to a zero value.
# In such cases, the next index is used until
# a non zero value appears.
Expand All @@ -136,8 +152,10 @@ def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
break
maxIndex += 1

meanSES = [np.mean(np.sort(num_reads_per_bin[0, :])[:maxIndex]),
np.mean(np.sort(num_reads_per_bin[1, :])[:maxIndex])]
meanSES = [
np.mean(np.sort(num_reads_per_bin[0, :])[:maxIndex]),
np.mean(np.sort(num_reads_per_bin[1, :])[:maxIndex]),
]

# the maxIndex may be too close to the the signal regions
# so i take a more conservative approach by taking a close number
Expand All @@ -151,9 +169,9 @@ def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
mean = []
std = []
for values in num_reads_per_bin:
maxNumReads = (np.percentile(values, 90))
maxNumReads = np.percentile(values, 90)
if maxNumReads == 0:
maxNumReads = (np.percentile(values, 99))
maxNumReads = np.percentile(values, 99)
if maxNumReads == 0:
print("all genomic regions sampled from one ")
"of the bam files have no reads.\n"
Expand All @@ -163,33 +181,38 @@ def estimateScaleFactor(bamFilesList, binLength, numberOfSamples,
std.append(np.std(values))

mean = np.array(mean)
readsPerBin = mean if avg_method == 'mean' else median
readsPerBin = mean if avg_method == "mean" else median

if min(median) == 0:
idx_zero = [ix + 1 for ix, value in enumerate(median) if value == 0]
exit("\n*ERROR*: The median coverage computed is zero for sample(s) #{}\n"
"Try selecting a larger sample size or a region with coverage\n".format(idx_zero))
exit(
"\n*ERROR*: The median coverage computed is zero for sample(s) #{}\n"
"Try selecting a larger sample size or a region with coverage\n".format(
idx_zero
)
)

sizeFactor = sizeFactorsSES
return {'size_factors': sizeFactor,
'size_factors_based_on_mapped_reads': sizeFactorBasedOnMappedReads,
'size_factors_SES': sizeFactorsSES,
'size_factors_based_on_mean': mean.min() / mean,
'size_factors_based_on_median': median.min() / median,
'mean': mean,
'meanSES': meanSES,
'median': median,
'reads_per_bin': readsPerBin,
'std': std,
'sites_sampled': sitesSampled}
return {
"size_factors": sizeFactor,
"size_factors_based_on_mapped_reads": sizeFactorBasedOnMappedReads,
"size_factors_SES": sizeFactorsSES,
"size_factors_based_on_mean": mean.min() / mean,
"size_factors_based_on_median": median.min() / median,
"mean": mean,
"meanSES": meanSES,
"median": median,
"reads_per_bin": readsPerBin,
"std": std,
"sites_sampled": sitesSampled,
}


class Tester(object):
    """Test fixture: holds paths to the bundled test BAM files and a test chromosome.

    NOTE(review): this text was extracted from a rendered GitHub diff page, so
    the duplicated ``self.chrom`` assignment below is a before/after diff pair
    (old single-quoted line, new double-quoted line), not intentional code.
    In the actual repository file only the second line exists.
    """

    def __init__(self):
        # Directory containing the test data shipped alongside the module.
        self.root = os.path.dirname(os.path.abspath(__file__)) + "/test/test_data/"
        self.bamFile1 = self.root + "testA.bam"
        self.bamFile2 = self.root + "testB.bam"
        # Reset the module-level debug flag for a clean test run.
        global debug
        debug = 0
        # Diff artifact: old (removed) line followed by its replacement.
        self.chrom = '3R'
        self.chrom = "3R"
3 changes: 1 addition & 2 deletions deeptools/_version.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@

# This file is originally generated from Git information by running 'setup.py
# version'. Distribution tarballs contain a pre-generated copy of this file.

__version__ = '3.5.1'
__version__ = "3.5.1"
Loading

0 comments on commit fb92d3c

Please sign in to comment.