From 440016d7b8054d0bedf97e3d44d03f6fd52e4fd9 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Tue, 24 May 2016 14:43:54 -0400 Subject: [PATCH 1/2] GC bias correction address code review --- .../exome/NormalizeSomaticReadCounts.java | 4 +- .../exome/TargetAnnotationCollection.java | 2 +- .../tools/exome/gcbias/CorrectGCBias.java | 95 +++++++++++++ .../tools/exome/gcbias/GCCorrector.java | 125 ++++++++++++++++++ .../fakedata/GCBiasSimulatedData.java | 77 +++++++++++ .../hellbender/fakedata/SimulatedSamples.java | 15 +++ .../hellbender/fakedata/SimulatedTargets.java | 23 ++++ .../CombineReadCountsIntegrationTest.java | 15 +-- .../gcbias/CorrectGCBiasIntegrationTest.java | 65 +++++++++ .../exome/gcbias/GCCorrectorUnitTest.java | 36 +++++ 10 files changed, 442 insertions(+), 15 deletions(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/CorrectGCBias.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/GCCorrector.java create mode 100644 src/test/java/org/broadinstitute/hellbender/fakedata/GCBiasSimulatedData.java create mode 100644 src/test/java/org/broadinstitute/hellbender/fakedata/SimulatedSamples.java create mode 100644 src/test/java/org/broadinstitute/hellbender/fakedata/SimulatedTargets.java create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/exome/gcbias/CorrectGCBiasIntegrationTest.java create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/exome/gcbias/GCCorrectorUnitTest.java diff --git a/src/main/java/org/broadinstitute/hellbender/tools/exome/NormalizeSomaticReadCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/exome/NormalizeSomaticReadCounts.java index 7f1bacab6..833126aa8 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/exome/NormalizeSomaticReadCounts.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/exome/NormalizeSomaticReadCounts.java @@ -214,8 +214,8 @@ private TargetCollection readTargetCollection(final File t * @param the element type of {@code exonCollection}, it must be a {@link BEDFeature}. * @return never {@code null}. * @throws UserException.CouldNotReadInputFile if there was some problem - * trying to read from {@code readCountsFile}. - * @throws UserException.BadInput if there is some format issue with the input file {@code readCountsFile}, + * trying to read from {@code inputReadCountsFile}. + * @throws UserException.BadInput if there is some format issue with the input file {@code inputReadCountsFile}, * or it does not contain target names and {@code exonCollection} is {@code null} * or the input read counts contain more thanb one sample. */ diff --git a/src/main/java/org/broadinstitute/hellbender/tools/exome/TargetAnnotationCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/exome/TargetAnnotationCollection.java index 2ee53dd62..7ca2b203c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/exome/TargetAnnotationCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/exome/TargetAnnotationCollection.java @@ -9,7 +9,7 @@ * * @author Valentin Ruano-Rubio <valentin@broadinstitute.org> */ -final class TargetAnnotationCollection { +public final class TargetAnnotationCollection { private final EnumMap values = new EnumMap(TargetAnnotation.class); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/CorrectGCBias.java b/src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/CorrectGCBias.java new file mode 100644 index 000000000..c7997c123 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/CorrectGCBias.java @@ -0,0 +1,95 @@ +package org.broadinstitute.hellbender.tools.exome.gcbias; + +import org.broadinstitute.hellbender.cmdline.*; +import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.exome.*; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; + +/** + * Correct coverage for sample-specific GC bias effects as described in {@link GCCorrector}. + * + * Inputs are + * 1. read counts file (format described in {@link ReadCountCollectionUtils}). Since this tool corrects + * for multiplicative GC bias effects, these counts should not be in log space i.e. they should represent + * raw coverage or proportional coverage upstream of tangent normalization. + * 2. targets file containing GC content annotation as produced by {@link AnnotateTargets}. Every target in the input + * read counts must be present in the targets file but the reverse need not be true. + * + * Output is a read counts file with the same targets (rows) and samples (columns) as the input, corrected for GC bias. + * @author David Benjamin <davidben@broadinstitute.org> + */ +@CommandLineProgramProperties( + oneLineSummary = "Correct for per-sample GC bias effects.", + summary = "Correct for per-sample GC bias effects.", + programGroup = CopyNumberProgramGroup.class +) +public class CorrectGCBias extends CommandLineProgram { + public static final String INPUT_READ_COUNTS_FILE_LONG_NAME = StandardArgumentDefinitions.INPUT_LONG_NAME; + public static final String INPUT_READ_COUNTS_FILE_SHORT_NAME = StandardArgumentDefinitions.INPUT_SHORT_NAME; + + public static final String OUTPUT_READ_COUNTS_FILE_LONG_NAME = StandardArgumentDefinitions.OUTPUT_LONG_NAME; + public static final String OUTPUT_READ_COUNTS_FILE_SHORT_NAME = StandardArgumentDefinitions.OUTPUT_SHORT_NAME; + + @Argument( + doc = "Read counts input file.", + shortName = INPUT_READ_COUNTS_FILE_SHORT_NAME, + fullName = INPUT_READ_COUNTS_FILE_LONG_NAME, + optional = false + ) + protected File inputReadCountsFile; + + @Argument( + doc = "Output file of GC-corrected read counts.", + shortName = OUTPUT_READ_COUNTS_FILE_SHORT_NAME, + fullName = OUTPUT_READ_COUNTS_FILE_LONG_NAME, + optional = false + ) + protected File outputReadCountsFile; + + // arguments for GC-annotated targets + @ArgumentCollection + protected TargetArgumentCollection targetArguments = new TargetArgumentCollection(); + + @Override + protected Object doWork() { + final TargetCollection targets = targetArguments.readTargetCollection(false); + final ReadCountCollection inputCounts = readInputCounts(targets); + final double[] gcContentByTarget = gcContentsOfTargets(inputCounts, targets); + final ReadCountCollection outputCounts = GCCorrector.correctCoverage(inputCounts, gcContentByTarget); + writeOutputCounts(outputCounts); + return "Success"; + } + + private ReadCountCollection readInputCounts(final TargetCollection targets) { + final ReadCountCollection inputCounts; + try { + inputCounts = ReadCountCollectionUtils.parse(inputReadCountsFile, targets, false); + } catch (final IOException ex) { + throw new UserException.CouldNotReadInputFile(inputReadCountsFile, ex.getMessage(), ex); + } + return inputCounts; + } + + private void writeOutputCounts(final ReadCountCollection outputCounts) { + try { + ReadCountCollectionUtils.write(outputReadCountsFile, outputCounts); + } catch (final IOException ex) { + throw new UserException.CouldNotCreateOutputFile(outputReadCountsFile, ex); + } + } + + private double[] gcContentsOfTargets(final ReadCountCollection inputCounts, final TargetCollection targets) { + final List annotatedTargets = inputCounts.targets().stream().map(t -> targets.target(t.getName())).collect(Collectors.toList()); + if (!annotatedTargets.stream().allMatch(t -> t.getAnnotations().hasAnnotation(TargetAnnotation.GC_CONTENT))) { + throw new UserException.BadInput("At least one target lacks a GC annotation."); + } + return annotatedTargets.stream().mapToDouble(t -> t.getAnnotations().getDouble(TargetAnnotation.GC_CONTENT)).toArray(); + } + + +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/GCCorrector.java b/src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/GCCorrector.java new file mode 100644 index 000000000..a7f5e0330 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/GCCorrector.java @@ -0,0 +1,125 @@ +package org.broadinstitute.hellbender.tools.exome.gcbias; + +import org.apache.commons.math3.linear.ArrayRealVector; +import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor; +import org.apache.commons.math3.linear.RealMatrix; +import org.apache.commons.math3.linear.RealVector; +import org.apache.commons.math3.stat.descriptive.rank.Median; +import org.broadinstitute.hellbender.tools.exome.ReadCountCollection; +import org.broadinstitute.hellbender.utils.Utils; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Learn multiplicative correction factors as a function of GC content from coverage vs. GC data. Basically, learn a + * regression curve of coverage vs. GC, and divide by that curve to get GC-corrected coverage. + * + * Our regression curve is obtained by filling GC content bins of width 0.01 with the coverages of targets corresponding to each GC + * and taking the median to get a robust estimate of the curve. In order to smooth out bins with few data (i.e. extreme + * GC values that occur rarely) we then convolve these medians with an exponential kernel. + * + * @author David Benjamin <davidben@broadinstitute.org> + */ +class GCCorrector { + // GC bins are 0%, 1% . . . 100% + private final static int NUMBER_OF_GC_BINS = 101; + + // scale (in units of GCcontent from 0 to 1) over which gc bias correlation decays + // i.e. the bias at GC content = 0.3 and at 0.2 are correlated ~exp(-0.1/correlationLength) + // this is effectively a smoothness parameter used for regularizing the GC bias estimate for + // GC content bins that have few targets) + private static final double correlationLength = 0.02; + private static final double correlationDecayRatePerBin = 1.0 / (correlationLength * NUMBER_OF_GC_BINS); + + // multiply by these to get a GC correction as a function of GC + private final double[] gcCorrectionFactors; + + //Apache commons median doesn't work on empty arrays; this value is a placeholder to avoid exceptions + private static final double DUMMY_VALUE_NEVER_USED = 1.0; + + /** + * Learn multiplicative correction factors as a function of GC from coverage vs. GC data. Basically, learn a + * regression curve of coverage vs. GC in order to divide by that curve later. + * + * @param gcContents GC content (from 0.0 to 1.0) of targets in {@code coverage} + * @param coverage raw of proportional coverage + */ + public GCCorrector(final double[] gcContents, final RealVector coverage) { + Utils.nonNull(gcContents); + Utils.nonNull(coverage); + Utils.validateArg(gcContents.length > 0, "must have at lest one datum"); + Utils.validateArg(gcContents.length == coverage.getDimension(), "must have one gc value per coverage."); + + final List> coveragesByGC = new ArrayList<>(NUMBER_OF_GC_BINS); + IntStream.range(0, NUMBER_OF_GC_BINS).forEach(n -> coveragesByGC.add(new ArrayList<>())); + IntStream.range(0, gcContents.length).forEach(n -> coveragesByGC.get(gcContentToBinIndex(gcContents[n])).add(coverage.getEntry(n))); + gcCorrectionFactors = calculateCorrectionFactors(coveragesByGC); + } + + /** + * As described above, calculate medians of each GC bin and convolve with an exponential kernel. + * + * @param coveragesByGC list of coverages for each GC bin + * @return multiplicative correction factors for each GC bin + */ + private double[] calculateCorrectionFactors(final List> coveragesByGC) { + final RealVector medians = new ArrayRealVector(coveragesByGC.stream().mapToDouble(GCCorrector::medianOrDefault).toArray()); + return IntStream.range(0, NUMBER_OF_GC_BINS).mapToDouble(bin -> { + final RealVector weights = new ArrayRealVector(IntStream.range(0, NUMBER_OF_GC_BINS) + .mapToDouble(n -> coveragesByGC.get(n).size() * Math.exp(-Math.abs(bin - n) * correlationDecayRatePerBin)).toArray()); + return weights.dotProduct(medians) / weights.getL1Norm(); + }).map(x -> 1/x).toArray(); + } + + /** + * + * @param inputCounts raw coverage before GC correction + * @param gcContentByTarget array of gc contents, one per target of the input + * @return GC-corrected coverage + */ + public static ReadCountCollection correctCoverage(final ReadCountCollection inputCounts, final double[] gcContentByTarget) { + // each column (sample) has its own GC bias curve, hence its own GC corrector + final List gcCorrectors = IntStream.range(0, inputCounts.columnNames().size()) + .mapToObj(n -> new GCCorrector(gcContentByTarget, inputCounts.counts().getColumnVector(n))).collect(Collectors.toList()); + + // gc correct a copy of the input counts in-place + final RealMatrix correctedCounts = inputCounts.counts().copy(); + correctedCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() { + @Override + public double visit(int target, int column, double coverage) { + return gcCorrectors.get(column).correctedCoverage(coverage, gcContentByTarget[target]); + } + }); + + // we would like the average correction factor to be 1.0 in the sense that average coverage before and after + // correction should be equal + final double[] columnNormalizationFactors = IntStream.range(0, inputCounts.columnNames().size()) + .mapToDouble(c -> inputCounts.counts().getColumnVector(c).getL1Norm() / correctedCounts.getColumnVector(c).getL1Norm()).toArray(); + correctedCounts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() { + @Override + public double visit(int target, int column, double coverage) { + return coverage * columnNormalizationFactors[column]; + } + }); + + return new ReadCountCollection(inputCounts.targets(), inputCounts.columnNames(), correctedCounts); + } + + private double correctedCoverage(final double coverage, final double gcContent) { + return gcCorrectionFactors[gcContentToBinIndex(gcContent)] * coverage; + } + + // return a median of coverages or dummy default value if no coverage exists at this gc bin + // this default is never used because empty bins get zero weight in {@code calculateCorrectionFactors} + private static double medianOrDefault(final List list) { + return list.size() > 0 ? new Median().evaluate(list.stream().mapToDouble(d->d).toArray()) : DUMMY_VALUE_NEVER_USED; + } + + private static int gcContentToBinIndex(final double gcContent) { + return (int) Math.round(gcContent * (NUMBER_OF_GC_BINS-1)); + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/fakedata/GCBiasSimulatedData.java b/src/test/java/org/broadinstitute/hellbender/fakedata/GCBiasSimulatedData.java new file mode 100644 index 000000000..ffd4af7a2 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/fakedata/GCBiasSimulatedData.java @@ -0,0 +1,77 @@ +package org.broadinstitute.hellbender.fakedata; + +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.commons.math3.linear.Array2DRowRealMatrix; +import org.apache.commons.math3.linear.DefaultRealMatrixChangingVisitor; +import org.apache.commons.math3.linear.RealMatrix; +import org.broadinstitute.hellbender.tools.exome.*; +import org.broadinstitute.hellbender.utils.SimpleInterval; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.Random; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * GC bias data consists of a {@link ReadCountCollection} and an array of gc contents + * in order of the read counts' targets. + * + * @author David Benjamin <davidben@broadinstitute.org> + */ +public class GCBiasSimulatedData { + + // a reasonable default GC bias curve + private static Function QUADRATIC_GC_BIAS_CURVE = gc -> 0.5 + 2*gc*(1-gc); + + // visible for the integration test + public static Pair simulatedData(final int numTargets, final int numSamples) { + final List phonyTargets = SimulatedTargets.phonyTargets(numTargets); + final List phonySamples = SimulatedSamples.phonySamples(numSamples); + + final Random random = new Random(13); + final double[] gcContentByTarget = IntStream.range(0, numTargets) + .mapToDouble(n -> 0.5 + 0.2*random.nextGaussian()) + .map(x -> Math.min(x,0.95)).map(x -> Math.max(x,0.05)).toArray(); + final double[] gcBiasByTarget = Arrays.stream(gcContentByTarget).map(QUADRATIC_GC_BIAS_CURVE::apply).toArray(); + + // model mainly GC bias with a small random amount of non-GC bias + // thus noise after GC correction should be nearly zero + final RealMatrix counts = new Array2DRowRealMatrix(numTargets, numSamples); + counts.walkInOptimizedOrder(new DefaultRealMatrixChangingVisitor() { + @Override + public double visit(final int target, final int column, final double value) { + return gcBiasByTarget[target]*(1.0 + 0.01*random.nextDouble()); + } + }); + final ReadCountCollection rcc = new ReadCountCollection(phonyTargets, phonySamples, counts); + return new ImmutablePair<>(rcc, gcContentByTarget); + } + + /** + * + * @param readCountsFile A simulated read counts file with GC bias effects + * @param targetsFile A simulated targets file with GC content annotation + */ + public static void makeGCBiasInputFiles(final Pair data, + final File readCountsFile, final File targetsFile) throws IOException { + final ReadCountCollection inputCounts = data.getLeft(); + final double[] gcContentByTarget = data.getRight(); + ReadCountCollectionUtils.write(readCountsFile, inputCounts); + + final TargetTableWriter writer = new TargetTableWriter(targetsFile, Collections.singleton(TargetAnnotation.GC_CONTENT)); + for (int i = 0; i < gcContentByTarget.length; i++) { + final Target unannotatedTarget = inputCounts.records().get(i).getTarget(); + final TargetAnnotationCollection annotations = new TargetAnnotationCollection(); + annotations.put(TargetAnnotation.GC_CONTENT, Double.toString(gcContentByTarget[i])); + final Target target = new Target(unannotatedTarget.getName(), unannotatedTarget.getInterval(), annotations); + writer.writeRecord(target); + } + writer.close(); + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/fakedata/SimulatedSamples.java b/src/test/java/org/broadinstitute/hellbender/fakedata/SimulatedSamples.java new file mode 100644 index 000000000..1f0bc6e3c --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/fakedata/SimulatedSamples.java @@ -0,0 +1,15 @@ +package org.broadinstitute.hellbender.fakedata; + +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * @author David Benjamin <davidben@broadinstitute.org> + */ +public class SimulatedSamples { + public static List phonySamples(final int numSamples) { + return IntStream.range(0, numSamples) + .mapToObj(n -> String.format("Sample%d", n)).collect(Collectors.toList()); + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/fakedata/SimulatedTargets.java b/src/test/java/org/broadinstitute/hellbender/fakedata/SimulatedTargets.java new file mode 100644 index 000000000..5e8f291bb --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/fakedata/SimulatedTargets.java @@ -0,0 +1,23 @@ +package org.broadinstitute.hellbender.fakedata; + +import org.broadinstitute.hellbender.tools.exome.Target; +import org.broadinstitute.hellbender.utils.SimpleInterval; + +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * @author David Benjamin <davidben@broadinstitute.org> + */ +public class SimulatedTargets { + + // phony targets whose intervals don't really matter + public static List phonyTargets(final int numTargets) { + return IntStream.range(0, numTargets).mapToObj(n -> { + final String name = String.format("Target%d", n); + final SimpleInterval interval = new SimpleInterval("chr", 2*n + 1, 2*n + 2); + return new Target(name, interval); + }).collect(Collectors.toList()); + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/exome/CombineReadCountsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/exome/CombineReadCountsIntegrationTest.java index 6479aded2..11d0a60fa 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/exome/CombineReadCountsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/exome/CombineReadCountsIntegrationTest.java @@ -4,6 +4,7 @@ import org.broadinstitute.hellbender.CommandLineProgramTest; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.fakedata.SimulatedTargets; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.test.BaseTest; import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection; @@ -45,12 +46,7 @@ public class CombineReadCountsIntegrationTest extends CommandLineProgramTest { @Test(expectedExceptions = UserException.class) public void testEmptyInputFileListProvided() throws Exception { @SuppressWarnings("serial") - final List phonyTargets = new ArrayList() {{ - add(new Target("target_0",new SimpleInterval("1", 100, 200))); - add(new Target("target_1",new SimpleInterval("2", 100, 200))); - add(new Target("target_2",new SimpleInterval("3", 100, 200))); - - }}; + final List phonyTargets = SimulatedTargets.phonyTargets(3); final List inputFiles = Collections.emptyList(); final File targetFile = createTargetFile(phonyTargets); final File inputListFile = createInputListFile(inputFiles); @@ -67,12 +63,7 @@ public void testEmptyInputFileListProvided() throws Exception { @Test(expectedExceptions = UserException.class) public void testNoInputFileProvided() throws Exception { @SuppressWarnings("serial") - final List phonyTargets = new ArrayList() {{ - add(new Target("target_0",new SimpleInterval("1", 100, 200))); - add(new Target("target_1",new SimpleInterval("2", 100, 200))); - add(new Target("target_2",new SimpleInterval("3", 100, 200))); - - }}; + final List phonyTargets = SimulatedTargets.phonyTargets(3); final List inputFiles = Collections.emptyList(); final File targetFile = createTargetFile(phonyTargets); try { diff --git a/src/test/java/org/broadinstitute/hellbender/tools/exome/gcbias/CorrectGCBiasIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/exome/gcbias/CorrectGCBiasIntegrationTest.java new file mode 100644 index 000000000..f83043d0d --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/exome/gcbias/CorrectGCBiasIntegrationTest.java @@ -0,0 +1,65 @@ +package org.broadinstitute.hellbender.tools.exome.gcbias; + +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.cmdline.ExomeStandardArgumentDefinitions; +import org.broadinstitute.hellbender.fakedata.GCBiasSimulatedData; +import org.broadinstitute.hellbender.tools.exome.*; +import org.testng.Assert; +import org.testng.annotations.AfterClass; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * @author David Benjamin <davidben@broadinstitute.org> + */ +public class CorrectGCBiasIntegrationTest extends CommandLineProgramTest { + // Test meta-parameters: + private static final int NUM_TARGETS = 10000; + private static final int NUM_SAMPLES = 10; + + private static final File TARGETS_FILE = createTempFile("targets", ".tsv"); + private static final File INPUT_COUNTS_FILE = createTempFile("input", ".tsv"); + private static final File OUTPUT_COUNTS_FILE = createTempFile("output", ".tsv"); + + double[] gcContentByTarget; + ReadCountCollection inputCounts; + + + @BeforeClass + public void createInputFiles() throws IOException { + final Pair data = GCBiasSimulatedData.simulatedData(NUM_TARGETS, NUM_SAMPLES); + inputCounts = data.getLeft(); + gcContentByTarget = data.getRight(); + GCBiasSimulatedData.makeGCBiasInputFiles(data, INPUT_COUNTS_FILE, TARGETS_FILE); + } + + // test that results match expected behavior of the backing class + @Test + public void testGCCorrection() throws IOException { + final List arguments = new ArrayList<>(); + arguments.addAll(Arrays.asList( + "-" + CorrectGCBias.INPUT_READ_COUNTS_FILE_SHORT_NAME, INPUT_COUNTS_FILE.getAbsolutePath(), + "-" + CorrectGCBias.OUTPUT_READ_COUNTS_FILE_SHORT_NAME, OUTPUT_COUNTS_FILE.getAbsolutePath(), + "-" + ExomeStandardArgumentDefinitions.TARGET_FILE_SHORT_NAME, TARGETS_FILE.getAbsolutePath())); + runCommandLine(arguments); + + final ReadCountCollection outputCounts = ReadCountCollectionUtils.parse(OUTPUT_COUNTS_FILE); + final ReadCountCollection expectedOutputCounts = GCCorrector.correctCoverage(inputCounts, gcContentByTarget); + Assert.assertEquals(outputCounts.columnNames(), inputCounts.columnNames()); + Assert.assertEquals(outputCounts.counts().subtract(expectedOutputCounts.counts()).getNorm(), 0, 1e-10); + } + + @AfterClass + public void disposeFile() throws IOException { + TARGETS_FILE.delete(); + INPUT_COUNTS_FILE.delete(); + OUTPUT_COUNTS_FILE.delete(); + } +} \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/exome/gcbias/GCCorrectorUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/exome/gcbias/GCCorrectorUnitTest.java new file mode 100644 index 000000000..c98a1813a --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/exome/gcbias/GCCorrectorUnitTest.java @@ -0,0 +1,36 @@ +package org.broadinstitute.hellbender.tools.exome.gcbias; + +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.hellbender.fakedata.GCBiasSimulatedData; +import org.broadinstitute.hellbender.tools.exome.ReadCountCollection; +import org.broadinstitute.hellbender.utils.GATKProtectedMathUtils; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * @author David Benjamin <davidben@broadinstitute.org> + */ +public class GCCorrectorUnitTest { + + @Test + public void testGCCorrection() { + final int numSamples = 5; + final int numTargets = 10000; + + final Pair data = GCBiasSimulatedData.simulatedData(numTargets, numSamples); + final ReadCountCollection rcc1 = data.getLeft(); + final double[] gcContentByTarget = data.getRight(); + final ReadCountCollection correctedCounts1 = GCCorrector.correctCoverage(rcc1, gcContentByTarget); + final double[] correctedNoiseBySample = GATKProtectedMathUtils.columnStdDevs(correctedCounts1.counts()); + Arrays.stream(correctedNoiseBySample).forEach(x -> Assert.assertTrue(x < 0.02)); + + //check that GC correction is approximately idempotent -- if you correct again, very little should happen + final ReadCountCollection correctedCounts2 = GCCorrector.correctCoverage(correctedCounts1, gcContentByTarget); + final double change1 = correctedCounts1.counts().subtract(rcc1.counts()).getFrobeniusNorm(); + final double change2 = correctedCounts2.counts().subtract(correctedCounts1.counts()).getFrobeniusNorm(); + Assert.assertTrue(change2 < change1 / 10); + } + +} \ No newline at end of file From ea0214f08f342f88ab59b0bb36e6e8087e145a30 Mon Sep 17 00:00:00 2001 From: David Benjamin Date: Thu, 2 Jun 2016 23:44:42 -0400 Subject: [PATCH 2/2] CLI summary --- .../hellbender/tools/exome/gcbias/CorrectGCBias.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/CorrectGCBias.java b/src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/CorrectGCBias.java index c7997c123..2d59fe1e0 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/CorrectGCBias.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/exome/gcbias/CorrectGCBias.java @@ -21,11 +21,14 @@ * read counts must be present in the targets file but the reverse need not be true. * * Output is a read counts file with the same targets (rows) and samples (columns) as the input, corrected for GC bias. + * Coverage is represented by doubles in {@link ReadCountCollection}. + * * @author David Benjamin <davidben@broadinstitute.org> */ @CommandLineProgramProperties( oneLineSummary = "Correct for per-sample GC bias effects.", - summary = "Correct for per-sample GC bias effects.", + summary = "Correct coverage in a read counts files by estimating per-sample bias as a function of target GC" + + " content and dividing input coverage by these derived bias curves.", programGroup = CopyNumberProgramGroup.class ) public class CorrectGCBias extends CommandLineProgram {