Merge pull request #532 from broadinstitute/db_gc

GC bias correction. Closes #226.
broadinstitute · Jun 3, 2016 · 8643640 · 8643640
2 parents e73b005 + ea0214f
commit 8643640
Show file tree

Hide file tree

Showing 10 changed files with 445 additions and 15 deletions.
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/exome/NormalizeSomaticReadCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/exome/NormalizeSomaticReadCounts.java
@@ -214,8 +214,8 @@ private TargetCollection<? extends BEDFeature> readTargetCollection(final File t
      * @param <E>            the element type of {@code exonCollection}, it must be a {@link BEDFeature}.
      * @return never {@code null}.
      * @throws UserException.CouldNotReadInputFile if there was some problem
-     *                                             trying to read from {@code readCountsFile}.
-     * @throws UserException.BadInput              if there is some format issue with the input file {@code readCountsFile},
+     *                                             trying to read from {@code inputReadCountsFile}.
+     * @throws UserException.BadInput              if there is some format issue with the input file {@code inputReadCountsFile},
      *                                             or it does not contain target names and {@code exonCollection} is {@code null}
      *                                             or the input read counts contain more thanb one sample.
      */

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/exome/TargetAnnotationCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/exome/TargetAnnotationCollection.java
@@ -9,7 +9,7 @@
  *
  * @author Valentin Ruano-Rubio &lt;[email protected]&gt;
  */
-final class TargetAnnotationCollection {
+public final class TargetAnnotationCollection {
 
     private final EnumMap<TargetAnnotation, String> values = new EnumMap<TargetAnnotation, String>(TargetAnnotation.class);
 

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/CorrectGCBias.java b/src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/CorrectGCBias.java
@@ -0,0 +1,98 @@
+package org.broadinstitute.hellbender.tools.exome.gcbias;
+
+import org.broadinstitute.hellbender.cmdline.*;
+import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.exome.*;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * Correct coverage for sample-specific GC bias effects as described in {@link GCCorrector}.
+ *
+ * Inputs are
+ * 1. read counts file (format described in {@link ReadCountCollectionUtils}).  Since this tool corrects
+ *    for multiplicative GC bias effects, these counts should not be in log space i.e. they should represent
+ *    raw coverage or proportional coverage upstream of tangent normalization.
+ * 2. targets file containing GC content annotation as produced by {@link AnnotateTargets}.  Every target in the input
+ *    read counts must be present in the targets file but the reverse need not be true.
+ *
+ * Output is a read counts file with the same targets (rows) and samples (columns) as the input, corrected for GC bias.
+ * Coverage is represented by doubles in {@link ReadCountCollection}.
+ *
+ * @author David Benjamin &lt;[email protected]&gt;
+ */
+@CommandLineProgramProperties(
+        oneLineSummary = "Correct for per-sample GC bias effects.",
+        summary = "Correct coverage in a read counts files by estimating per-sample bias as a function of target GC" +
+                " content and dividing input coverage by these derived bias curves.",
+        programGroup = CopyNumberProgramGroup.class
+)
+public class CorrectGCBias extends CommandLineProgram {
+    public static final String INPUT_READ_COUNTS_FILE_LONG_NAME = StandardArgumentDefinitions.INPUT_LONG_NAME;
+    public static final String INPUT_READ_COUNTS_FILE_SHORT_NAME = StandardArgumentDefinitions.INPUT_SHORT_NAME;
+
+    public static final String OUTPUT_READ_COUNTS_FILE_LONG_NAME = StandardArgumentDefinitions.OUTPUT_LONG_NAME;
+    public static final String OUTPUT_READ_COUNTS_FILE_SHORT_NAME = StandardArgumentDefinitions.OUTPUT_SHORT_NAME;
+
+    @Argument(
+            doc = "Read counts input file.",
+            shortName = INPUT_READ_COUNTS_FILE_SHORT_NAME,
+            fullName = INPUT_READ_COUNTS_FILE_LONG_NAME,
+            optional = false
+    )
+    protected File inputReadCountsFile;
+
+    @Argument(
+            doc = "Output file of GC-corrected read counts.",
+            shortName = OUTPUT_READ_COUNTS_FILE_SHORT_NAME,
+            fullName = OUTPUT_READ_COUNTS_FILE_LONG_NAME,
+            optional = false
+    )
+    protected File outputReadCountsFile;
+
+    // arguments for GC-annotated targets
+    @ArgumentCollection
+    protected TargetArgumentCollection targetArguments = new TargetArgumentCollection();
+
+    @Override
+    protected Object doWork() {
+        final TargetCollection<Target> targets = targetArguments.readTargetCollection(false);
+        final ReadCountCollection inputCounts = readInputCounts(targets);
+        final double[] gcContentByTarget = gcContentsOfTargets(inputCounts, targets);
+        final ReadCountCollection outputCounts = GCCorrector.correctCoverage(inputCounts, gcContentByTarget);
+        writeOutputCounts(outputCounts);
+        return "Success";
+    }
+
+    private ReadCountCollection readInputCounts(final TargetCollection<Target> targets) {
+        final ReadCountCollection inputCounts;
+        try {
+            inputCounts = ReadCountCollectionUtils.parse(inputReadCountsFile, targets, false);
+        } catch (final IOException ex) {
+            throw new UserException.CouldNotReadInputFile(inputReadCountsFile, ex.getMessage(), ex);
+        }
+        return inputCounts;
+    }
+
+    private void writeOutputCounts(final ReadCountCollection outputCounts) {
+        try {
+            ReadCountCollectionUtils.write(outputReadCountsFile, outputCounts);
+        } catch (final IOException ex) {
+            throw new UserException.CouldNotCreateOutputFile(outputReadCountsFile, ex);
+        }
+    }
+
+    private double[] gcContentsOfTargets(final ReadCountCollection inputCounts, final TargetCollection<Target> targets) {
+        final List<Target> annotatedTargets = inputCounts.targets().stream().map(t -> targets.target(t.getName())).collect(Collectors.toList());
+        if (!annotatedTargets.stream().allMatch(t -> t.getAnnotations().hasAnnotation(TargetAnnotation.GC_CONTENT))) {
+            throw new UserException.BadInput("At least one target lacks a GC annotation.");
+        }
+        return annotatedTargets.stream().mapToDouble(t -> t.getAnnotations().getDouble(TargetAnnotation.GC_CONTENT)).toArray();
+    }
+
+
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/GCCorrector.java b/src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/GCCorrector.java
@@ -0,0 +1,125 @@
+package org.broadinstitute.hellbender.tools.exome.gcbias;
+
+import org.apache.commons.math3.linear.ArrayRealVector;
+import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor;
+import org.apache.commons.math3.linear.RealMatrix;
+import org.apache.commons.math3.linear.RealVector;
+import org.apache.commons.math3.stat.descriptive.rank.Median;
+import org.broadinstitute.hellbender.tools.exome.ReadCountCollection;
+import org.broadinstitute.hellbender.utils.Utils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+/**
+ * Learn multiplicative correction factors as a function of GC content from coverage vs. GC data.  Basically, learn a
+ * regression curve of coverage vs. GC, and divide by that curve to get GC-corrected coverage.
+ *
+ * Our regression curve is obtained by filling GC content bins of width 0.01 with the coverages of targets corresponding to each GC
+ * and taking the median to get a robust estimate of the curve.  In order to smooth out bins with few data (i.e. extreme
+ * GC values that occur rarely) we then convolve these medians with an exponential kernel.
+ *
+ * @author David Benjamin &lt;[email protected]&gt;
+ */
+class GCCorrector {
+    // GC bins are 0%, 1% . . . 100%
+    private final static int NUMBER_OF_GC_BINS = 101;
+
+    // scale (in units of GCcontent from 0 to 1) over which gc bias correlation decays
+    // i.e. the bias at GC content = 0.3 and at 0.2 are correlated ~exp(-0.1/correlationLength)
+    // this is effectively a smoothness parameter used for regularizing the GC bias estimate for
+    // GC content bins that have few targets)
+    private static final double correlationLength = 0.02;
+    private static final double correlationDecayRatePerBin = 1.0 / (correlationLength * NUMBER_OF_GC_BINS);
+
+    // multiply by these to get a GC correction as a function of GC
+    private final double[] gcCorrectionFactors;
+
+    //Apache commons median doesn't work on empty arrays; this value is a placeholder to avoid exceptions
+    private static final double DUMMY_VALUE_NEVER_USED = 1.0;
+
+    /**
+     * Learn multiplicative correction factors as a function of GC from coverage vs. GC data.  Basically, learn a
+     * regression curve of coverage vs. GC in order to divide by that curve later.
+     *
+     * @param gcContents GC content (from 0.0 to 1.0) of targets in {@code coverage}
+     * @param coverage raw of proportional coverage
+     */
+    public GCCorrector(final double[] gcContents, final RealVector coverage) {
+        Utils.nonNull(gcContents);
+        Utils.nonNull(coverage);
+        Utils.validateArg(gcContents.length > 0, "must have at lest one datum");
+        Utils.validateArg(gcContents.length == coverage.getDimension(), "must have one gc value per coverage.");
+
+        final List<List<Double>> coveragesByGC = new ArrayList<>(NUMBER_OF_GC_BINS);
+        IntStream.range(0, NUMBER_OF_GC_BINS).forEach(n -> coveragesByGC.add(new ArrayList<>()));
+        IntStream.range(0, gcContents.length).forEach(n -> coveragesByGC.get(gcContentToBinIndex(gcContents[n])).add(coverage.getEntry(n)));
+        gcCorrectionFactors = calculateCorrectionFactors(coveragesByGC);
+    }
+
+    /**
+     * As described above, calculate medians of each GC bin and convolve with an exponential kernel.
+     *
+     * @param coveragesByGC list of coverages for each GC bin
+     * @return multiplicative correction factors for each GC bin
+     */
+    private double[] calculateCorrectionFactors(final List<List<Double>> coveragesByGC) {
+        final RealVector medians = new ArrayRealVector(coveragesByGC.stream().mapToDouble(GCCorrector::medianOrDefault).toArray());
+        return IntStream.range(0, NUMBER_OF_GC_BINS).mapToDouble(bin -> {
+            final RealVector weights = new ArrayRealVector(IntStream.range(0, NUMBER_OF_GC_BINS)
+                    .mapToDouble(n -> coveragesByGC.get(n).size() * Math.exp(-Math.abs(bin - n) * correlationDecayRatePerBin)).toArray());
+            return weights.dotProduct(medians) / weights.getL1Norm();
+        }).map(x -> 1/x).toArray();
+    }
+
+    /**
+     *
+     * @param inputCounts raw coverage before GC correction
+     * @param gcContentByTarget      array of gc contents, one per target of the input
+     * @return              GC-corrected coverage
+     */
+    public static ReadCountCollection correctCoverage(final ReadCountCollection inputCounts, final double[] gcContentByTarget) {
+        // each column (sample) has its own GC bias curve, hence its own GC corrector
+        final List<GCCorrector> gcCorrectors = IntStream.range(0, inputCounts.columnNames().size())
+                .mapToObj(n -> new GCCorrector(gcContentByTarget, inputCounts.counts().getColumnVector(n))).collect(Collectors.toList());
+
+        // gc correct a copy of the input counts in-place
+        final RealMatrix correctedCounts = inputCounts.counts().copy();
+        correctedCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
+            @Override
+            public double visit(int target, int column, double coverage) {
+                return gcCorrectors.get(column).correctedCoverage(coverage, gcContentByTarget[target]);
+            }
+        });
+
+        // we would like the average correction factor to be 1.0 in the sense that average coverage before and after
+        // correction should be equal
+        final double[] columnNormalizationFactors = IntStream.range(0, inputCounts.columnNames().size())
+                .mapToDouble(c -> inputCounts.counts().getColumnVector(c).getL1Norm() / correctedCounts.getColumnVector(c).getL1Norm()).toArray();
+        correctedCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
+            @Override
+            public double visit(int target, int column, double coverage) {
+                return coverage * columnNormalizationFactors[column];
+            }
+        });
+
+        return new ReadCountCollection(inputCounts.targets(), inputCounts.columnNames(), correctedCounts);
+    }
+
+    private double correctedCoverage(final double coverage, final double gcContent) {
+        return gcCorrectionFactors[gcContentToBinIndex(gcContent)] * coverage;
+    }
+
+    // return a median of coverages or dummy default value if no coverage exists at this gc bin
+    // this default is never used because empty bins get zero weight in {@code calculateCorrectionFactors}
+    private static double medianOrDefault(final List<Double> list) {
+        return list.size() > 0 ? new Median().evaluate(list.stream().mapToDouble(d->d).toArray()) : DUMMY_VALUE_NEVER_USED;
+    }
+
+    private static int gcContentToBinIndex(final double gcContent) {
+        return (int) Math.round(gcContent * (NUMBER_OF_GC_BINS-1));
+    }
+}
diff --git a/src/test/java/org/broadinstitute/hellbender/fakedata/GCBiasSimulatedData.java b/src/test/java/org/broadinstitute/hellbender/fakedata/GCBiasSimulatedData.java
@@ -0,0 +1,77 @@
+package org.broadinstitute.hellbender.fakedata;
+
+import org.apache.commons.lang3.tuple.ImmutablePair;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.commons.math3.linear.Array2DRowRealMatrix;
+import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor;
+import org.apache.commons.math3.linear.RealMatrix;
+import org.broadinstitute.hellbender.tools.exome.*;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+/**
+ * GC bias data consists of a {@link ReadCountCollection} and an array of gc contents
+ * in order of the read counts' targets.
+ *
+ * @author David Benjamin &lt;[email protected]&gt;
+ */
+public class GCBiasSimulatedData {
+
+    // a reasonable default GC bias curve
+    private static Function<Double, Double> QUADRATIC_GC_BIAS_CURVE = gc -> 0.5 + 2*gc*(1-gc);
+
+    // visible for the integration test
+    public static Pair<ReadCountCollection, double[]> simulatedData(final int numTargets, final int numSamples) {
+        final List<Target> phonyTargets = SimulatedTargets.phonyTargets(numTargets);
+        final List<String> phonySamples = SimulatedSamples.phonySamples(numSamples);
+
+        final Random random = new Random(13);
+        final double[] gcContentByTarget = IntStream.range(0, numTargets)
+                .mapToDouble(n -> 0.5 + 0.2*random.nextGaussian())
+                .map(x -> Math.min(x,0.95)).map(x -> Math.max(x,0.05)).toArray();
+        final double[] gcBiasByTarget = Arrays.stream(gcContentByTarget).map(QUADRATIC_GC_BIAS_CURVE::apply).toArray();
+
+        // model mainly GC bias with a small random amount of non-GC bias
+        // thus noise after GC correction should be nearly zero
+        final RealMatrix counts = new Array2DRowRealMatrix(numTargets, numSamples);
+        counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() {
+            @Override
+            public double visit(final int target, final int column, final double value) {
+                return gcBiasByTarget[target]*(1.0 + 0.01*random.nextDouble());
+            }
+        });
+        final ReadCountCollection rcc = new ReadCountCollection(phonyTargets, phonySamples, counts);
+        return new ImmutablePair<>(rcc, gcContentByTarget);
+    }
+
+    /**
+     *
+     * @param readCountsFile    A simulated read counts file with GC bias effects
+     * @param targetsFile       A simulated targets file with GC content annotation
+     */
+    public static void makeGCBiasInputFiles(final Pair<ReadCountCollection, double[]> data,
+                                            final File readCountsFile, final File targetsFile) throws IOException {
+        final ReadCountCollection inputCounts = data.getLeft();
+        final double[] gcContentByTarget = data.getRight();
+        ReadCountCollectionUtils.write(readCountsFile, inputCounts);
+
+        final TargetTableWriter writer = new TargetTableWriter(targetsFile, Collections.singleton(TargetAnnotation.GC_CONTENT));
+        for (int i = 0; i < gcContentByTarget.length; i++) {
+            final Target unannotatedTarget = inputCounts.records().get(i).getTarget();
+            final TargetAnnotationCollection annotations = new TargetAnnotationCollection();
+            annotations.put(TargetAnnotation.GC_CONTENT, Double.toString(gcContentByTarget[i]));
+            final Target target = new Target(unannotatedTarget.getName(), unannotatedTarget.getInterval(), annotations);
+            writer.writeRecord(target);
+        }
+        writer.close();
+    }
+}
diff --git a/src/test/java/org/broadinstitute/hellbender/fakedata/SimulatedSamples.java b/src/test/java/org/broadinstitute/hellbender/fakedata/SimulatedSamples.java
@@ -0,0 +1,15 @@
+package org.broadinstitute.hellbender.fakedata;
+
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+/**
+ * @author David Benjamin &lt;[email protected]&gt;
+ */
+public class SimulatedSamples {
+    public static List<String> phonySamples(final int numSamples) {
+        return IntStream.range(0, numSamples)
+                .mapToObj(n -> String.format("Sample%d", n)).collect(Collectors.toList());
+    }
+}
diff --git a/src/test/java/org/broadinstitute/hellbender/fakedata/SimulatedTargets.java b/src/test/java/org/broadinstitute/hellbender/fakedata/SimulatedTargets.java
@@ -0,0 +1,23 @@
+package org.broadinstitute.hellbender.fakedata;
+
+import org.broadinstitute.hellbender.tools.exome.Target;
+import org.broadinstitute.hellbender.utils.SimpleInterval;
+
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+/**
+ * @author David Benjamin &lt;[email protected]&gt;
+ */
+public class SimulatedTargets {
+
+    // phony targets whose intervals don't really matter
+    public static List<Target> phonyTargets(final int numTargets) {
+        return IntStream.range(0, numTargets).mapToObj(n -> {
+            final String name = String.format("Target%d", n);
+            final SimpleInterval interval = new SimpleInterval("chr", 2*n + 1, 2*n + 2);
+            return new Target(name, interval);
+        }).collect(Collectors.toList());
+    }
+}
diff --git a/...test/java/org/broadinstitute/hellbender/tools/exome/CombineReadCountsIntegrationTest.java b/...test/java/org/broadinstitute/hellbender/tools/exome/CombineReadCountsIntegrationTest.java
@@ -4,6 +4,7 @@
 import org.broadinstitute.hellbender.CommandLineProgramTest;
 import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
 import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.fakedata.SimulatedTargets;
 import org.broadinstitute.hellbender.utils.SimpleInterval;
 import org.broadinstitute.hellbender.utils.test.BaseTest;
 import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
@@ -45,12 +46,7 @@ public class CombineReadCountsIntegrationTest extends CommandLineProgramTest {
     @Test(expectedExceptions = UserException.class)
     public void testEmptyInputFileListProvided() throws Exception {
         @SuppressWarnings("serial")
-        final List<Target> phonyTargets = new ArrayList<Target>() {{
-            add(new Target("target_0",new SimpleInterval("1", 100, 200)));
-            add(new Target("target_1",new SimpleInterval("2", 100, 200)));
-            add(new Target("target_2",new SimpleInterval("3", 100, 200)));
-
-        }};
+        final List<Target> phonyTargets = SimulatedTargets.phonyTargets(3);
         final List<File> inputFiles = Collections.emptyList();
         final File targetFile = createTargetFile(phonyTargets);
         final File inputListFile = createInputListFile(inputFiles);
@@ -67,12 +63,7 @@ public void testEmptyInputFileListProvided() throws Exception {
     @Test(expectedExceptions = UserException.class)
     public void testNoInputFileProvided() throws Exception {
         @SuppressWarnings("serial")
-        final List<Target> phonyTargets = new ArrayList<Target>() {{
-            add(new Target("target_0",new SimpleInterval("1", 100, 200)));
-            add(new Target("target_1",new SimpleInterval("2", 100, 200)));
-            add(new Target("target_2",new SimpleInterval("3", 100, 200)));
-
-        }};
+        final List<Target> phonyTargets = SimulatedTargets.phonyTargets(3);
         final List<File> inputFiles = Collections.emptyList();
         final File targetFile = createTargetFile(phonyTargets);
         try {