This repository has been archived by the owner on Nov 9, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #532 from broadinstitute/db_gc
GC bias correction. Closes #226.
- Loading branch information
Showing
10 changed files
with
445 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,7 +9,7 @@ | |
* | ||
* @author Valentin Ruano-Rubio <[email protected]> | ||
*/ | ||
final class TargetAnnotationCollection { | ||
public final class TargetAnnotationCollection { | ||
|
||
private final EnumMap<TargetAnnotation, String> values = new EnumMap<TargetAnnotation, String>(TargetAnnotation.class); | ||
|
||
|
98 changes: 98 additions & 0 deletions
98
src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/CorrectGCBias.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
package org.broadinstitute.hellbender.tools.exome.gcbias; | ||
|
||
import org.broadinstitute.hellbender.cmdline.*; | ||
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup; | ||
import org.broadinstitute.hellbender.exceptions.UserException; | ||
import org.broadinstitute.hellbender.tools.exome.*; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.util.List; | ||
import java.util.stream.Collectors; | ||
|
||
/** | ||
* Correct coverage for sample-specific GC bias effects as described in {@link GCCorrector}. | ||
* | ||
* Inputs are | ||
* 1. read counts file (format described in {@link ReadCountCollectionUtils}). Since this tool corrects | ||
* for multiplicative GC bias effects, these counts should not be in log space i.e. they should represent | ||
* raw coverage or proportional coverage upstream of tangent normalization. | ||
* 2. targets file containing GC content annotation as produced by {@link AnnotateTargets}. Every target in the input | ||
* read counts must be present in the targets file but the reverse need not be true. | ||
* | ||
* Output is a read counts file with the same targets (rows) and samples (columns) as the input, corrected for GC bias. | ||
* Coverage is represented by doubles in {@link ReadCountCollection}. | ||
* | ||
* @author David Benjamin <[email protected]> | ||
*/ | ||
@CommandLineProgramProperties( | ||
oneLineSummary = "Correct for per-sample GC bias effects.", | ||
summary = "Correct coverage in a read counts files by estimating per-sample bias as a function of target GC" + | ||
" content and dividing input coverage by these derived bias curves.", | ||
programGroup = CopyNumberProgramGroup.class | ||
) | ||
public class CorrectGCBias extends CommandLineProgram { | ||
public static final String INPUT_READ_COUNTS_FILE_LONG_NAME = StandardArgumentDefinitions.INPUT_LONG_NAME; | ||
public static final String INPUT_READ_COUNTS_FILE_SHORT_NAME = StandardArgumentDefinitions.INPUT_SHORT_NAME; | ||
|
||
public static final String OUTPUT_READ_COUNTS_FILE_LONG_NAME = StandardArgumentDefinitions.OUTPUT_LONG_NAME; | ||
public static final String OUTPUT_READ_COUNTS_FILE_SHORT_NAME = StandardArgumentDefinitions.OUTPUT_SHORT_NAME; | ||
|
||
@Argument( | ||
doc = "Read counts input file.", | ||
shortName = INPUT_READ_COUNTS_FILE_SHORT_NAME, | ||
fullName = INPUT_READ_COUNTS_FILE_LONG_NAME, | ||
optional = false | ||
) | ||
protected File inputReadCountsFile; | ||
|
||
@Argument( | ||
doc = "Output file of GC-corrected read counts.", | ||
shortName = OUTPUT_READ_COUNTS_FILE_SHORT_NAME, | ||
fullName = OUTPUT_READ_COUNTS_FILE_LONG_NAME, | ||
optional = false | ||
) | ||
protected File outputReadCountsFile; | ||
|
||
// arguments for GC-annotated targets | ||
@ArgumentCollection | ||
protected TargetArgumentCollection targetArguments = new TargetArgumentCollection(); | ||
|
||
@Override | ||
protected Object doWork() { | ||
final TargetCollection<Target> targets = targetArguments.readTargetCollection(false); | ||
final ReadCountCollection inputCounts = readInputCounts(targets); | ||
final double[] gcContentByTarget = gcContentsOfTargets(inputCounts, targets); | ||
final ReadCountCollection outputCounts = GCCorrector.correctCoverage(inputCounts, gcContentByTarget); | ||
writeOutputCounts(outputCounts); | ||
return "Success"; | ||
} | ||
|
||
private ReadCountCollection readInputCounts(final TargetCollection<Target> targets) { | ||
final ReadCountCollection inputCounts; | ||
try { | ||
inputCounts = ReadCountCollectionUtils.parse(inputReadCountsFile, targets, false); | ||
} catch (final IOException ex) { | ||
throw new UserException.CouldNotReadInputFile(inputReadCountsFile, ex.getMessage(), ex); | ||
} | ||
return inputCounts; | ||
} | ||
|
||
private void writeOutputCounts(final ReadCountCollection outputCounts) { | ||
try { | ||
ReadCountCollectionUtils.write(outputReadCountsFile, outputCounts); | ||
} catch (final IOException ex) { | ||
throw new UserException.CouldNotCreateOutputFile(outputReadCountsFile, ex); | ||
} | ||
} | ||
|
||
private double[] gcContentsOfTargets(final ReadCountCollection inputCounts, final TargetCollection<Target> targets) { | ||
final List<Target> annotatedTargets = inputCounts.targets().stream().map(t -> targets.target(t.getName())).collect(Collectors.toList()); | ||
if (!annotatedTargets.stream().allMatch(t -> t.getAnnotations().hasAnnotation(TargetAnnotation.GC_CONTENT))) { | ||
throw new UserException.BadInput("At least one target lacks a GC annotation."); | ||
} | ||
return annotatedTargets.stream().mapToDouble(t -> t.getAnnotations().getDouble(TargetAnnotation.GC_CONTENT)).toArray(); | ||
} | ||
|
||
|
||
} |
125 changes: 125 additions & 0 deletions
125
src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/GCCorrector.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
package org.broadinstitute.hellbender.tools.exome.gcbias; | ||
|
||
import org.apache.commons.math3.linear.ArrayRealVector; | ||
import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor; | ||
import org.apache.commons.math3.linear.RealMatrix; | ||
import org.apache.commons.math3.linear.RealVector; | ||
import org.apache.commons.math3.stat.descriptive.rank.Median; | ||
import org.broadinstitute.hellbender.tools.exome.ReadCountCollection; | ||
import org.broadinstitute.hellbender.utils.Utils; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.IntStream; | ||
|
||
/** | ||
* Learn multiplicative correction factors as a function of GC content from coverage vs. GC data. Basically, learn a | ||
* regression curve of coverage vs. GC, and divide by that curve to get GC-corrected coverage. | ||
* | ||
* Our regression curve is obtained by filling GC content bins of width 0.01 with the coverages of targets corresponding to each GC | ||
* and taking the median to get a robust estimate of the curve. In order to smooth out bins with few data (i.e. extreme | ||
* GC values that occur rarely) we then convolve these medians with an exponential kernel. | ||
* | ||
* @author David Benjamin <[email protected]> | ||
*/ | ||
class GCCorrector { | ||
// GC bins are 0%, 1% . . . 100% | ||
private final static int NUMBER_OF_GC_BINS = 101; | ||
|
||
// scale (in units of GCcontent from 0 to 1) over which gc bias correlation decays | ||
// i.e. the bias at GC content = 0.3 and at 0.2 are correlated ~exp(-0.1/correlationLength) | ||
// this is effectively a smoothness parameter used for regularizing the GC bias estimate for | ||
// GC content bins that have few targets) | ||
private static final double correlationLength = 0.02; | ||
private static final double correlationDecayRatePerBin = 1.0 / (correlationLength * NUMBER_OF_GC_BINS); | ||
|
||
// multiply by these to get a GC correction as a function of GC | ||
private final double[] gcCorrectionFactors; | ||
|
||
//Apache commons median doesn't work on empty arrays; this value is a placeholder to avoid exceptions | ||
private static final double DUMMY_VALUE_NEVER_USED = 1.0; | ||
|
||
/** | ||
* Learn multiplicative correction factors as a function of GC from coverage vs. GC data. Basically, learn a | ||
* regression curve of coverage vs. GC in order to divide by that curve later. | ||
* | ||
* @param gcContents GC content (from 0.0 to 1.0) of targets in {@code coverage} | ||
* @param coverage raw of proportional coverage | ||
*/ | ||
public GCCorrector(final double[] gcContents, final RealVector coverage) { | ||
Utils.nonNull(gcContents); | ||
Utils.nonNull(coverage); | ||
Utils.validateArg(gcContents.length > 0, "must have at lest one datum"); | ||
Utils.validateArg(gcContents.length == coverage.getDimension(), "must have one gc value per coverage."); | ||
|
||
final List<List<Double>> coveragesByGC = new ArrayList<>(NUMBER_OF_GC_BINS); | ||
IntStream.range(0, NUMBER_OF_GC_BINS).forEach(n -> coveragesByGC.add(new ArrayList<>())); | ||
IntStream.range(0, gcContents.length).forEach(n -> coveragesByGC.get(gcContentToBinIndex(gcContents[n])).add(coverage.getEntry(n))); | ||
gcCorrectionFactors = calculateCorrectionFactors(coveragesByGC); | ||
} | ||
|
||
/** | ||
* As described above, calculate medians of each GC bin and convolve with an exponential kernel. | ||
* | ||
* @param coveragesByGC list of coverages for each GC bin | ||
* @return multiplicative correction factors for each GC bin | ||
*/ | ||
private double[] calculateCorrectionFactors(final List<List<Double>> coveragesByGC) { | ||
final RealVector medians = new ArrayRealVector(coveragesByGC.stream().mapToDouble(GCCorrector::medianOrDefault).toArray()); | ||
return IntStream.range(0, NUMBER_OF_GC_BINS).mapToDouble(bin -> { | ||
final RealVector weights = new ArrayRealVector(IntStream.range(0, NUMBER_OF_GC_BINS) | ||
.mapToDouble(n -> coveragesByGC.get(n).size() * Math.exp(-Math.abs(bin - n) * correlationDecayRatePerBin)).toArray()); | ||
return weights.dotProduct(medians) / weights.getL1Norm(); | ||
}).map(x -> 1/x).toArray(); | ||
} | ||
|
||
/** | ||
* | ||
* @param inputCounts raw coverage before GC correction | ||
* @param gcContentByTarget array of gc contents, one per target of the input | ||
* @return GC-corrected coverage | ||
*/ | ||
public static ReadCountCollection correctCoverage(final ReadCountCollection inputCounts, final double[] gcContentByTarget) { | ||
// each column (sample) has its own GC bias curve, hence its own GC corrector | ||
final List<GCCorrector> gcCorrectors = IntStream.range(0, inputCounts.columnNames().size()) | ||
.mapToObj(n -> new GCCorrector(gcContentByTarget, inputCounts.counts().getColumnVector(n))).collect(Collectors.toList()); | ||
|
||
// gc correct a copy of the input counts in-place | ||
final RealMatrix correctedCounts = inputCounts.counts().copy(); | ||
correctedCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() { | ||
@Override | ||
public double visit(int target, int column, double coverage) { | ||
return gcCorrectors.get(column).correctedCoverage(coverage, gcContentByTarget[target]); | ||
} | ||
}); | ||
|
||
// we would like the average correction factor to be 1.0 in the sense that average coverage before and after | ||
// correction should be equal | ||
final double[] columnNormalizationFactors = IntStream.range(0, inputCounts.columnNames().size()) | ||
.mapToDouble(c -> inputCounts.counts().getColumnVector(c).getL1Norm() / correctedCounts.getColumnVector(c).getL1Norm()).toArray(); | ||
correctedCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() { | ||
@Override | ||
public double visit(int target, int column, double coverage) { | ||
return coverage * columnNormalizationFactors[column]; | ||
} | ||
}); | ||
|
||
return new ReadCountCollection(inputCounts.targets(), inputCounts.columnNames(), correctedCounts); | ||
} | ||
|
||
private double correctedCoverage(final double coverage, final double gcContent) { | ||
return gcCorrectionFactors[gcContentToBinIndex(gcContent)] * coverage; | ||
} | ||
|
||
// return a median of coverages or dummy default value if no coverage exists at this gc bin | ||
// this default is never used because empty bins get zero weight in {@code calculateCorrectionFactors} | ||
private static double medianOrDefault(final List<Double> list) { | ||
return list.size() > 0 ? new Median().evaluate(list.stream().mapToDouble(d->d).toArray()) : DUMMY_VALUE_NEVER_USED; | ||
} | ||
|
||
private static int gcContentToBinIndex(final double gcContent) { | ||
return (int) Math.round(gcContent * (NUMBER_OF_GC_BINS-1)); | ||
} | ||
} |
77 changes: 77 additions & 0 deletions
77
src/test/java/org/broadinstitute/hellbender/fakedata/GCBiasSimulatedData.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
package org.broadinstitute.hellbender.fakedata; | ||
|
||
import org.apache.commons.lang3.tuple.ImmutablePair; | ||
import org.apache.commons.lang3.tuple.Pair; | ||
import org.apache.commons.math3.linear.Array2DRowRealMatrix; | ||
import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor; | ||
import org.apache.commons.math3.linear.RealMatrix; | ||
import org.broadinstitute.hellbender.tools.exome.*; | ||
import org.broadinstitute.hellbender.utils.SimpleInterval; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.util.Arrays; | ||
import java.util.Collections; | ||
import java.util.List; | ||
import java.util.Random; | ||
import java.util.function.Function; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.IntStream; | ||
|
||
/** | ||
* GC bias data consists of a {@link ReadCountCollection} and an array of gc contents | ||
* in order of the read counts' targets. | ||
* | ||
* @author David Benjamin <[email protected]> | ||
*/ | ||
public class GCBiasSimulatedData { | ||
|
||
// a reasonable default GC bias curve | ||
private static Function<Double, Double> QUADRATIC_GC_BIAS_CURVE = gc -> 0.5 + 2*gc*(1-gc); | ||
|
||
// visible for the integration test | ||
public static Pair<ReadCountCollection, double[]> simulatedData(final int numTargets, final int numSamples) { | ||
final List<Target> phonyTargets = SimulatedTargets.phonyTargets(numTargets); | ||
final List<String> phonySamples = SimulatedSamples.phonySamples(numSamples); | ||
|
||
final Random random = new Random(13); | ||
final double[] gcContentByTarget = IntStream.range(0, numTargets) | ||
.mapToDouble(n -> 0.5 + 0.2*random.nextGaussian()) | ||
.map(x -> Math.min(x,0.95)).map(x -> Math.max(x,0.05)).toArray(); | ||
final double[] gcBiasByTarget = Arrays.stream(gcContentByTarget).map(QUADRATIC_GC_BIAS_CURVE::apply).toArray(); | ||
|
||
// model mainly GC bias with a small random amount of non-GC bias | ||
// thus noise after GC correction should be nearly zero | ||
final RealMatrix counts = new Array2DRowRealMatrix(numTargets, numSamples); | ||
counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() { | ||
@Override | ||
public double visit(final int target, final int column, final double value) { | ||
return gcBiasByTarget[target]*(1.0 + 0.01*random.nextDouble()); | ||
} | ||
}); | ||
final ReadCountCollection rcc = new ReadCountCollection(phonyTargets, phonySamples, counts); | ||
return new ImmutablePair<>(rcc, gcContentByTarget); | ||
} | ||
|
||
/** | ||
* | ||
* @param readCountsFile A simulated read counts file with GC bias effects | ||
* @param targetsFile A simulated targets file with GC content annotation | ||
*/ | ||
public static void makeGCBiasInputFiles(final Pair<ReadCountCollection, double[]> data, | ||
final File readCountsFile, final File targetsFile) throws IOException { | ||
final ReadCountCollection inputCounts = data.getLeft(); | ||
final double[] gcContentByTarget = data.getRight(); | ||
ReadCountCollectionUtils.write(readCountsFile, inputCounts); | ||
|
||
final TargetTableWriter writer = new TargetTableWriter(targetsFile, Collections.singleton(TargetAnnotation.GC_CONTENT)); | ||
for (int i = 0; i < gcContentByTarget.length; i++) { | ||
final Target unannotatedTarget = inputCounts.records().get(i).getTarget(); | ||
final TargetAnnotationCollection annotations = new TargetAnnotationCollection(); | ||
annotations.put(TargetAnnotation.GC_CONTENT, Double.toString(gcContentByTarget[i])); | ||
final Target target = new Target(unannotatedTarget.getName(), unannotatedTarget.getInterval(), annotations); | ||
writer.writeRecord(target); | ||
} | ||
writer.close(); | ||
} | ||
} |
15 changes: 15 additions & 0 deletions
15
src/test/java/org/broadinstitute/hellbender/fakedata/SimulatedSamples.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
package org.broadinstitute.hellbender.fakedata; | ||
|
||
import java.util.List; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.IntStream; | ||
|
||
/** | ||
* @author David Benjamin <[email protected]> | ||
*/ | ||
public class SimulatedSamples { | ||
public static List<String> phonySamples(final int numSamples) { | ||
return IntStream.range(0, numSamples) | ||
.mapToObj(n -> String.format("Sample%d", n)).collect(Collectors.toList()); | ||
} | ||
} |
23 changes: 23 additions & 0 deletions
23
src/test/java/org/broadinstitute/hellbender/fakedata/SimulatedTargets.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
package org.broadinstitute.hellbender.fakedata; | ||
|
||
import org.broadinstitute.hellbender.tools.exome.Target; | ||
import org.broadinstitute.hellbender.utils.SimpleInterval; | ||
|
||
import java.util.List; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.IntStream; | ||
|
||
/** | ||
* @author David Benjamin <[email protected]> | ||
*/ | ||
public class SimulatedTargets { | ||
|
||
// phony targets whose intervals don't really matter | ||
public static List<Target> phonyTargets(final int numTargets) { | ||
return IntStream.range(0, numTargets).mapToObj(n -> { | ||
final String name = String.format("Target%d", n); | ||
final SimpleInterval interval = new SimpleInterval("chr", 2*n + 1, 2*n + 2); | ||
return new Target(name, interval); | ||
}).collect(Collectors.toList()); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.