This repository has been archived by the owner on Nov 9, 2019. It is now read-only.

Commit

Merge pull request #532 from broadinstitute/db_gc
GC bias correction. Closes #226.
LeeTL1220 committed Jun 3, 2016
2 parents e73b005 + ea0214f commit 8643640
Showing 10 changed files with 445 additions and 15 deletions.
@@ -214,8 +214,8 @@ private TargetCollection<? extends BEDFeature> readTargetCollection(final File t
* @param <E> the element type of {@code exonCollection}, it must be a {@link BEDFeature}.
* @return never {@code null}.
* @throws UserException.CouldNotReadInputFile if there was some problem
* trying to read from {@code readCountsFile}.
* @throws UserException.BadInput if there is some format issue with the input file {@code readCountsFile},
* trying to read from {@code inputReadCountsFile}.
* @throws UserException.BadInput if there is some format issue with the input file {@code inputReadCountsFile},
* or it does not contain target names and {@code exonCollection} is {@code null}
* or the input read counts contain more than one sample.
*/
@@ -9,7 +9,7 @@
*
* @author Valentin Ruano-Rubio &lt;[email protected]&gt;
*/
final class TargetAnnotationCollection {
public final class TargetAnnotationCollection {

private final EnumMap<TargetAnnotation, String> values = new EnumMap<TargetAnnotation, String>(TargetAnnotation.class);

@@ -0,0 +1,98 @@
package org.broadinstitute.hellbender.tools.exome.gcbias;

import org.broadinstitute.hellbender.cmdline.*;
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.exome.*;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;

/**
* Correct coverage for sample-specific GC bias effects as described in {@link GCCorrector}.
*
* Inputs are
* 1. read counts file (format described in {@link ReadCountCollectionUtils}). Since this tool corrects
* for multiplicative GC bias effects, these counts should not be in log space, i.e. they should represent
* raw coverage or proportional coverage upstream of tangent normalization.
* 2. targets file containing GC content annotation as produced by {@link AnnotateTargets}. Every target in the input
* read counts must be present in the targets file but the reverse need not be true.
*
* Output is a read counts file with the same targets (rows) and samples (columns) as the input, corrected for GC bias.
* Coverage is represented by doubles in {@link ReadCountCollection}.
*
* @author David Benjamin &lt;[email protected]&gt;
*/
@CommandLineProgramProperties(
oneLineSummary = "Correct for per-sample GC bias effects.",
summary = "Correct coverage in a read counts files by estimating per-sample bias as a function of target GC" +
" content and dividing input coverage by these derived bias curves.",
programGroup = CopyNumberProgramGroup.class
)
public class CorrectGCBias extends CommandLineProgram {
public static final String INPUT_READ_COUNTS_FILE_LONG_NAME = StandardArgumentDefinitions.INPUT_LONG_NAME;
public static final String INPUT_READ_COUNTS_FILE_SHORT_NAME = StandardArgumentDefinitions.INPUT_SHORT_NAME;

public static final String OUTPUT_READ_COUNTS_FILE_LONG_NAME = StandardArgumentDefinitions.OUTPUT_LONG_NAME;
public static final String OUTPUT_READ_COUNTS_FILE_SHORT_NAME = StandardArgumentDefinitions.OUTPUT_SHORT_NAME;

@Argument(
doc = "Read counts input file.",
shortName = INPUT_READ_COUNTS_FILE_SHORT_NAME,
fullName = INPUT_READ_COUNTS_FILE_LONG_NAME,
optional = false
)
protected File inputReadCountsFile;

@Argument(
doc = "Output file of GC-corrected read counts.",
shortName = OUTPUT_READ_COUNTS_FILE_SHORT_NAME,
fullName = OUTPUT_READ_COUNTS_FILE_LONG_NAME,
optional = false
)
protected File outputReadCountsFile;

// arguments for GC-annotated targets
@ArgumentCollection
protected TargetArgumentCollection targetArguments = new TargetArgumentCollection();

@Override
protected Object doWork() {
final TargetCollection<Target> targets = targetArguments.readTargetCollection(false);
final ReadCountCollection inputCounts = readInputCounts(targets);
final double[] gcContentByTarget = gcContentsOfTargets(inputCounts, targets);
final ReadCountCollection outputCounts = GCCorrector.correctCoverage(inputCounts, gcContentByTarget);
writeOutputCounts(outputCounts);
return "Success";
}

private ReadCountCollection readInputCounts(final TargetCollection<Target> targets) {
final ReadCountCollection inputCounts;
try {
inputCounts = ReadCountCollectionUtils.parse(inputReadCountsFile, targets, false);
} catch (final IOException ex) {
throw new UserException.CouldNotReadInputFile(inputReadCountsFile, ex.getMessage(), ex);
}
return inputCounts;
}

private void writeOutputCounts(final ReadCountCollection outputCounts) {
try {
ReadCountCollectionUtils.write(outputReadCountsFile, outputCounts);
} catch (final IOException ex) {
throw new UserException.CouldNotCreateOutputFile(outputReadCountsFile, ex);
}
}

private double[] gcContentsOfTargets(final ReadCountCollection inputCounts, final TargetCollection<Target> targets) {
final List<Target> annotatedTargets = inputCounts.targets().stream().map(t -> targets.target(t.getName())).collect(Collectors.toList());
if (!annotatedTargets.stream().allMatch(t -> t.getAnnotations().hasAnnotation(TargetAnnotation.GC_CONTENT))) {
throw new UserException.BadInput("At least one target lacks a GC annotation.");
}
return annotatedTargets.stream().mapToDouble(t -> t.getAnnotations().getDouble(TargetAnnotation.GC_CONTENT)).toArray();
}


}
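
The tool class above only parses arguments and handles I/O; the actual correction is delegated to GCCorrector in the next file. The following sketch is hypothetical and not part of this commit: it exercises that same hand-off using the simulated-data helper added later in the commit, and it assumes it is compiled inside the org.broadinstitute.hellbender.tools.exome.gcbias package (GCCorrector is package-private) with the test sources on the classpath.

package org.broadinstitute.hellbender.tools.exome.gcbias;

import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.fakedata.GCBiasSimulatedData;
import org.broadinstitute.hellbender.tools.exome.ReadCountCollection;

// Hypothetical sketch: performs the same correction step that CorrectGCBias.doWork()
// runs between reading and writing read counts.
public class GCCorrectionSketch {
    public static void main(final String[] args) {
        // simulated coverage for 1000 targets and 2 samples, dominated by a quadratic GC bias curve
        final Pair<ReadCountCollection, double[]> data = GCBiasSimulatedData.simulatedData(1000, 2);
        final ReadCountCollection rawCounts = data.getLeft();
        final double[] gcContentByTarget = data.getRight();

        // the operation CorrectGCBias delegates to GCCorrector
        final ReadCountCollection correctedCounts = GCCorrector.correctCoverage(rawCounts, gcContentByTarget);

        // after correction each sample's coverage should be nearly flat across targets;
        // only the ~1% simulated non-GC noise remains
        System.out.println(correctedCounts.counts().getColumnVector(0).getMaxValue()
                - correctedCounts.counts().getColumnVector(0).getMinValue());
    }
}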
@@ -0,0 +1,125 @@
package org.broadinstitute.hellbender.tools.exome.gcbias;

import org.apache.commons.math3.linear.ArrayRealVector;
import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor;
import org.apache.commons.math3.linear.RealMatrix;
import org.apache.commons.math3.linear.RealVector;
import org.apache.commons.math3.stat.descriptive.rank.Median;
import org.broadinstitute.hellbender.tools.exome.ReadCountCollection;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
* Learn multiplicative correction factors as a function of GC content from coverage vs. GC data. Basically, learn a
* regression curve of coverage vs. GC, and divide by that curve to get GC-corrected coverage.
*
* Our regression curve is obtained by filling GC content bins of width 0.01 with the coverages of the targets falling in each bin
* and taking each bin's median to get a robust estimate of the curve. In order to smooth out bins with few data points (i.e. extreme
* GC values that occur rarely) we then convolve these medians with an exponential kernel.
*
* @author David Benjamin &lt;[email protected]&gt;
*/
class GCCorrector {
// GC bins are 0%, 1% . . . 100%
private final static int NUMBER_OF_GC_BINS = 101;

// scale (in units of GC content from 0 to 1) over which GC bias correlation decays
// i.e. the biases at GC content 0.3 and 0.2 are correlated ~exp(-0.1/correlationLength)
// this is effectively a smoothness parameter used for regularizing the GC bias estimate for
// GC content bins that have few targets
private static final double correlationLength = 0.02;
private static final double correlationDecayRatePerBin = 1.0 / (correlationLength * NUMBER_OF_GC_BINS);

// multiply by these to get a GC correction as a function of GC
private final double[] gcCorrectionFactors;

//Apache commons median doesn't work on empty arrays; this value is a placeholder to avoid exceptions
private static final double DUMMY_VALUE_NEVER_USED = 1.0;

/**
* Learn multiplicative correction factors as a function of GC from coverage vs. GC data. Basically, learn a
* regression curve of coverage vs. GC in order to divide by that curve later.
*
* @param gcContents GC content (from 0.0 to 1.0) of targets in {@code coverage}
* @param coverage raw or proportional coverage
*/
public GCCorrector(final double[] gcContents, final RealVector coverage) {
Utils.nonNull(gcContents);
Utils.nonNull(coverage);
Utils.validateArg(gcContents.length > 0, "must have at least one datum");
Utils.validateArg(gcContents.length == coverage.getDimension(), "must have one gc value per coverage.");

final List<List<Double>> coveragesByGC = new ArrayList<>(NUMBER_OF_GC_BINS);
IntStream.range(0, NUMBER_OF_GC_BINS).forEach(n -> coveragesByGC.add(new ArrayList<>()));
IntStream.range(0, gcContents.length).forEach(n -> coveragesByGC.get(gcContentToBinIndex(gcContents[n])).add(coverage.getEntry(n)));
gcCorrectionFactors = calculateCorrectionFactors(coveragesByGC);
}

/**
* As described above, calculate medians of each GC bin and convolve with an exponential kernel.
*
* @param coveragesByGC list of coverages for each GC bin
* @return multiplicative correction factors for each GC bin
*/
private double[] calculateCorrectionFactors(final List<List<Double>> coveragesByGC) {
final RealVector medians = new ArrayRealVector(coveragesByGC.stream().mapToDouble(GCCorrector::medianOrDefault).toArray());
return IntStream.range(0, NUMBER_OF_GC_BINS).mapToDouble(bin -> {
final RealVector weights = new ArrayRealVector(IntStream.range(0, NUMBER_OF_GC_BINS)
.mapToDouble(n -> coveragesByGC.get(n).size() * Math.exp(-Math.abs(bin - n) * correlationDecayRatePerBin)).toArray());
return weights.dotProduct(medians) / weights.getL1Norm();
}).map(x -> 1/x).toArray();
}

/**
*
* @param inputCounts raw coverage before GC correction
* @param gcContentByTarget array of gc contents, one per target of the input
* @return GC-corrected coverage
*/
public static ReadCountCollection correctCoverage(final ReadCountCollection inputCounts, final double[] gcContentByTarget) {
// each column (sample) has its own GC bias curve, hence its own GC corrector
final List<GCCorrector> gcCorrectors = IntStream.range(0, inputCounts.columnNames().size())
.mapToObj(n -> new GCCorrector(gcContentByTarget, inputCounts.counts().getColumnVector(n))).collect(Collectors.toList());

// gc correct a copy of the input counts in-place
final RealMatrix correctedCounts = inputCounts.counts().copy();
correctedCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
@Override
public double visit(int target, int column, double coverage) {
return gcCorrectors.get(column).correctedCoverage(coverage, gcContentByTarget[target]);
}
});

// we would like the average correction factor to be 1.0 in the sense that average coverage before and after
// correction should be equal
final double[] columnNormalizationFactors = IntStream.range(0, inputCounts.columnNames().size())
.mapToDouble(c -> inputCounts.counts().getColumnVector(c).getL1Norm() / correctedCounts.getColumnVector(c).getL1Norm()).toArray();
correctedCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
@Override
public double visit(int target, int column, double coverage) {
return coverage * columnNormalizationFactors[column];
}
});

return new ReadCountCollection(inputCounts.targets(), inputCounts.columnNames(), correctedCounts);
}

private double correctedCoverage(final double coverage, final double gcContent) {
return gcCorrectionFactors[gcContentToBinIndex(gcContent)] * coverage;
}

// return the median of coverages, or a dummy default value if no coverage exists at this GC bin
// this default is never used because empty bins get zero weight in {@code calculateCorrectionFactors}
private static double medianOrDefault(final List<Double> list) {
return list.size() > 0 ? new Median().evaluate(list.stream().mapToDouble(d->d).toArray()) : DUMMY_VALUE_NEVER_USED;
}

private static int gcContentToBinIndex(final double gcContent) {
return (int) Math.round(gcContent * (NUMBER_OF_GC_BINS-1));
}
}
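
For readers who want the numerical recipe without the hellbender and commons-math types, here is a self-contained sketch of the same bin-the-coverages, take-medians, smooth-with-an-exponential-kernel computation. The constants are copied from the class above; the sketch is an illustration under those assumptions, not the code the tool runs.

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

// Stand-alone illustration of the smoothing performed in calculateCorrectionFactors above.
public class GCSmoothingSketch {
    private static final int NUMBER_OF_GC_BINS = 101;       // 0%, 1%, ..., 100%
    private static final double CORRELATION_LENGTH = 0.02;  // in GC-content units, 0 to 1
    private static final double DECAY_RATE_PER_BIN = 1.0 / (CORRELATION_LENGTH * NUMBER_OF_GC_BINS);

    /** Multiplicative correction factor per GC bin: 1 / (kernel-smoothed median coverage). */
    public static double[] correctionFactors(final double[] gcContents, final double[] coverage) {
        // 1. bin coverages by rounded GC percent (requires at least one datum overall)
        final List<List<Double>> coveragesByGC = new ArrayList<>();
        for (int bin = 0; bin < NUMBER_OF_GC_BINS; bin++) {
            coveragesByGC.add(new ArrayList<>());
        }
        for (int t = 0; t < gcContents.length; t++) {
            coveragesByGC.get((int) Math.round(gcContents[t] * (NUMBER_OF_GC_BINS - 1))).add(coverage[t]);
        }

        // 2. robust per-bin estimate: the median (empty bins get a placeholder that carries zero weight below)
        final double[] medians = new double[NUMBER_OF_GC_BINS];
        for (int bin = 0; bin < NUMBER_OF_GC_BINS; bin++) {
            medians[bin] = median(coveragesByGC.get(bin));
        }

        // 3. smooth with an exponential kernel weighted by bin occupancy, then invert
        final double[] factors = new double[NUMBER_OF_GC_BINS];
        for (int bin = 0; bin < NUMBER_OF_GC_BINS; bin++) {
            double weightedSum = 0.0;
            double totalWeight = 0.0;
            for (int n = 0; n < NUMBER_OF_GC_BINS; n++) {
                final double weight = coveragesByGC.get(n).size() * Math.exp(-Math.abs(bin - n) * DECAY_RATE_PER_BIN);
                weightedSum += weight * medians[n];
                totalWeight += weight;
            }
            factors[bin] = totalWeight / weightedSum;   // 1 / smoothed coverage at this GC
        }
        return factors;
    }

    private static double median(final List<Double> values) {
        if (values.isEmpty()) {
            return 1.0;   // never used: empty bins contribute zero weight to the smoothing
        }
        final List<Double> sorted = new ArrayList<>(values);
        Collections.sort(sorted);
        final int n = sorted.size();
        return n % 2 == 1 ? sorted.get(n / 2) : 0.5 * (sorted.get(n / 2 - 1) + sorted.get(n / 2));
    }

    public static void main(final String[] args) {
        // toy data: coverage proportional to a quadratic GC bias curve, so the
        // correction factors should roughly invert that curve where data exist
        final double[] gc = {0.2, 0.3, 0.4, 0.5, 0.6, 0.5, 0.4, 0.3};
        final double[] cov = new double[gc.length];
        for (int i = 0; i < gc.length; i++) {
            cov[i] = 0.5 + 2 * gc[i] * (1 - gc[i]);
        }
        System.out.println(java.util.Arrays.toString(correctionFactors(gc, cov)));
    }
}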
@@ -0,0 +1,77 @@
package org.broadinstitute.hellbender.fakedata;

import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.linear.Array2DRowRealMatrix;
import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor;
import org.apache.commons.math3.linear.RealMatrix;
import org.broadinstitute.hellbender.tools.exome.*;
import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
* GC bias data consists of a {@link ReadCountCollection} and an array of gc contents
* in order of the read counts' targets.
*
* @author David Benjamin &lt;[email protected]&gt;
*/
public class GCBiasSimulatedData {

// a reasonable default GC bias curve
private static final Function<Double, Double> QUADRATIC_GC_BIAS_CURVE = gc -> 0.5 + 2*gc*(1-gc);

// visible for the integration test
public static Pair<ReadCountCollection, double[]> simulatedData(final int numTargets, final int numSamples) {
final List<Target> phonyTargets = SimulatedTargets.phonyTargets(numTargets);
final List<String> phonySamples = SimulatedSamples.phonySamples(numSamples);

final Random random = new Random(13);
final double[] gcContentByTarget = IntStream.range(0, numTargets)
.mapToDouble(n -> 0.5 + 0.2*random.nextGaussian())
.map(x -> Math.min(x,0.95)).map(x -> Math.max(x,0.05)).toArray();
final double[] gcBiasByTarget = Arrays.stream(gcContentByTarget).map(QUADRATIC_GC_BIAS_CURVE::apply).toArray();

// model mainly GC bias with a small random amount of non-GC bias
// thus noise after GC correction should be nearly zero
final RealMatrix counts = new Array2DRowRealMatrix(numTargets, numSamples);
counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
@Override
public double visit(final int target, final int column, final double value) {
return gcBiasByTarget[target]*(1.0 + 0.01*random.nextDouble());
}
});
final ReadCountCollection rcc = new ReadCountCollection(phonyTargets, phonySamples, counts);
return new ImmutablePair<>(rcc, gcContentByTarget);
}

/**
* Write a simulated read counts file and a GC-annotated targets file suitable as inputs for GC bias correction.
*
* @param data simulated read counts and per-target GC contents, as returned by {@link #simulatedData}
* @param readCountsFile file to which the simulated read counts (with GC bias effects) are written
* @param targetsFile file to which the simulated targets, annotated with GC content, are written
*/
public static void makeGCBiasInputFiles(final Pair<ReadCountCollection, double[]> data,
final File readCountsFile, final File targetsFile) throws IOException {
final ReadCountCollection inputCounts = data.getLeft();
final double[] gcContentByTarget = data.getRight();
ReadCountCollectionUtils.write(readCountsFile, inputCounts);

final TargetTableWriter writer = new TargetTableWriter(targetsFile, Collections.singleton(TargetAnnotation.GC_CONTENT));
for (int i = 0; i < gcContentByTarget.length; i++) {
final Target unannotatedTarget = inputCounts.records().get(i).getTarget();
final TargetAnnotationCollection annotations = new TargetAnnotationCollection();
annotations.put(TargetAnnotation.GC_CONTENT, Double.toString(gcContentByTarget[i]));
final Target target = new Target(unannotatedTarget.getName(), unannotatedTarget.getInterval(), annotations);
writer.writeRecord(target);
}
writer.close();
}
}
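
A short hypothetical usage snippet for the helper above (imagined inside a test method that declares throws IOException; the temp-file names are placeholders): the two files it writes are exactly the read counts and annotated targets that CorrectGCBias consumes.

// Hypothetical test snippet, not part of this commit.
final Pair<ReadCountCollection, double[]> data = GCBiasSimulatedData.simulatedData(500, 3);
final File readCountsFile = File.createTempFile("simulated-counts", ".tsv");
final File targetsFile = File.createTempFile("simulated-targets", ".tsv");
GCBiasSimulatedData.makeGCBiasInputFiles(data, readCountsFile, targetsFile);
// readCountsFile now holds GC-biased proportional coverage for 3 samples over 500 targets;
// targetsFile holds the same targets annotated with GC_CONTENT, i.e. the targets file
// CorrectGCBias reads through its TargetArgumentCollection.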
@@ -0,0 +1,15 @@
package org.broadinstitute.hellbender.fakedata;

import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
* @author David Benjamin &lt;[email protected]&gt;
*/
public class SimulatedSamples {
public static List<String> phonySamples(final int numSamples) {
return IntStream.range(0, numSamples)
.mapToObj(n -> String.format("Sample%d", n)).collect(Collectors.toList());
}
}
@@ -0,0 +1,23 @@
package org.broadinstitute.hellbender.fakedata;

import org.broadinstitute.hellbender.tools.exome.Target;
import org.broadinstitute.hellbender.utils.SimpleInterval;

import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
* @author David Benjamin &lt;[email protected]&gt;
*/
public class SimulatedTargets {

// phony targets whose intervals don't really matter
public static List<Target> phonyTargets(final int numTargets) {
return IntStream.range(0, numTargets).mapToObj(n -> {
final String name = String.format("Target%d", n);
final SimpleInterval interval = new SimpleInterval("chr", 2*n + 1, 2*n + 2);
return new Target(name, interval);
}).collect(Collectors.toList());
}
}
@@ -4,6 +4,7 @@
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.fakedata.SimulatedTargets;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.test.BaseTest;
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
@@ -45,12 +46,7 @@ public class CombineReadCountsIntegrationTest extends CommandLineProgramTest {
@Test(expectedExceptions = UserException.class)
public void testEmptyInputFileListProvided() throws Exception {
@SuppressWarnings("serial")
final List<Target> phonyTargets = new ArrayList<Target>() {{
add(new Target("target_0",new SimpleInterval("1", 100, 200)));
add(new Target("target_1",new SimpleInterval("2", 100, 200)));
add(new Target("target_2",new SimpleInterval("3", 100, 200)));

}};
final List<Target> phonyTargets = SimulatedTargets.phonyTargets(3);
final List<File> inputFiles = Collections.emptyList();
final File targetFile = createTargetFile(phonyTargets);
final File inputListFile = createInputListFile(inputFiles);
@@ -67,12 +63,7 @@ public void testEmptyInputFileListProvided() throws Exception {
@Test(expectedExceptions = UserException.class)
public void testNoInputFileProvided() throws Exception {
@SuppressWarnings("serial")
final List<Target> phonyTargets = new ArrayList<Target>() {{
add(new Target("target_0",new SimpleInterval("1", 100, 200)));
add(new Target("target_1",new SimpleInterval("2", 100, 200)));
add(new Target("target_2",new SimpleInterval("3", 100, 200)));

}};
final List<Target> phonyTargets = SimulatedTargets.phonyTargets(3);
final List<File> inputFiles = Collections.emptyList();
final File targetFile = createTargetFile(phonyTargets);
try {