Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update dependencies and build new data release #236

Merged
merged 2 commits into from
Nov 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,13 @@

<phenol.version>2.0.0</phenol.version>
<phenopacket-tools.version>1.0.0-RC2</phenopacket-tools.version>
<phenopacket-schema.version>2.0.2</phenopacket-schema.version>
<silent.genes.version>0.2.5</silent.genes.version>
<htsjdk.version>3.0.5</htsjdk.version>
<commons-csv.version>1.7</commons-csv.version>
<commons-io.version>2.8.0</commons-io.version>
<commons-net.version>3.8.0</commons-net.version>
<commons-compress.version>1.24.0</commons-compress.version>
<h2.version>1.4.200</h2.version>
</properties>

Expand Down Expand Up @@ -111,7 +117,7 @@
<dependency>
<groupId>org.phenopackets</groupId>
<artifactId>phenopacket-schema</artifactId>
<version>2.0.2</version>
<version>${phenopacket-schema.version}</version>
</dependency>
<dependency>
<groupId>com.google.protobuf</groupId>
Expand All @@ -131,7 +137,7 @@
<dependency>
<groupId>com.github.samtools</groupId>
<artifactId>htsjdk</artifactId>
<version>3.0.5</version>
<version>${htsjdk.version}</version>
<exclusions>
<exclusion>
<groupId>org.tukaani</groupId>
Expand All @@ -143,22 +149,22 @@
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
<version>1.7</version>
<version>${commons-csv.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.21</version>
<version>${commons-compress.version}</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.8.0</version>
<version>${commons-io.version}</version>
</dependency>
<dependency>
<groupId>commons-net</groupId>
<artifactId>commons-net</artifactId>
<version>3.8.0</version>
<version>${commons-net.version}</version>
</dependency>
<dependency>
<groupId>info.picocli</groupId>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.monarchinitiative.svanna.benchmark.cmd.benchmark_case;

import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.monarchinitiative.svanna.benchmark.cmd.BaseBenchmarkCommand;
Expand Down Expand Up @@ -34,6 +33,7 @@
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.GZIPOutputStream;

@CommandLine.Command(name = "benchmark-case",
aliases = {"BC"},
Expand Down Expand Up @@ -191,7 +191,7 @@ private void writeOutResults(File output, BenchmarkResults results, Set<String>

// "case_name", "background_vcf", "variant_id", "rank", "vtype", "is_causal", "priority"
LOGGER.info("Writing the results for `{}`", results.caseName());
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new GzipCompressorOutputStream(new FileOutputStream(output))))) {
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(output))))) {
CSVPrinter printer = CSVFormat.DEFAULT
.withHeader("case_name", "background_vcf", "variant_id", "rank", "vtype", "is_causal", "priority")
.print(writer);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.monarchinitiative.svanna.cli.writer.tabular;

import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVPrinter;
import org.monarchinitiative.svanna.cli.writer.AnalysisResults;
Expand All @@ -27,6 +26,7 @@
import java.util.LinkedList;
import java.util.List;
import java.util.function.Consumer;
import java.util.zip.GZIPOutputStream;

public class TabularResultWriter implements ResultWriter {

Expand Down Expand Up @@ -63,7 +63,7 @@ private BufferedWriter openWriter(Path output, String prefix) throws IOException
Path outPath = output.resolve(prefix + suffix + (compress ? ".gz" : ""));
LogUtils.logInfo(LOGGER, "Writing tabular results into {}", outPath.toAbsolutePath());
return compress
? new BufferedWriter(new OutputStreamWriter(new GzipCompressorOutputStream(new FileOutputStream(outPath.toFile()))))
? new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(outPath.toFile()))))
: Files.newBufferedWriter(outPath);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
import org.apache.commons.codec.digest.MessageDigestAlgorithms;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipFile;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorOutputStream;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
Expand Down Expand Up @@ -87,6 +85,8 @@
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

@CommandLine.Command(name = "build-db",
aliases = "B",
Expand Down Expand Up @@ -369,7 +369,7 @@ private static List<? extends GencodeGene> downloadAndPreprocessGenes(GeneProper
GeneParser jsonParser = parserFactory.forFormat(SerializationFormat.JSON);
Path destination = buildDir.resolve("gencode.v38.genes.json.gz");
LOGGER.info("Serializing the genes to {}", destination.toAbsolutePath());
try (OutputStream os = new BufferedOutputStream(new GzipCompressorOutputStream(Files.newOutputStream(destination)))) {
try (OutputStream os = new BufferedOutputStream(new GZIPOutputStream(Files.newOutputStream(destination)))) {
jsonParser.write(genes, os);
}

Expand Down Expand Up @@ -563,7 +563,7 @@ private static Map<Integer, Integer> parseNcbiToHgncTable(String ncbiGeneToHgnc)

private static BufferedReader openForReading(Path tablePath) throws IOException {
return (tablePath.toFile().getName().endsWith(".gz"))
? new BufferedReader(new InputStreamReader(new GzipCompressorInputStream(Files.newInputStream(tablePath))))
? new BufferedReader(new InputStreamReader(new GZIPInputStream(Files.newInputStream(tablePath))))
: Files.newBufferedReader(tablePath);

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderVersion;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.monarchinitiative.svanna.core.LogUtils;
import org.monarchinitiative.svanna.core.filter.FilterResult;
import org.monarchinitiative.svanna.core.filter.FilterType;
Expand All @@ -23,15 +22,16 @@
import org.slf4j.LoggerFactory;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Stream;
import java.util.zip.GZIPInputStream;

/**
* Parse variants stored in a VCF file. The parser is <em>NOT</em> thread safe!
Expand Down Expand Up @@ -97,20 +97,39 @@ public Stream<FullSvannaVariant> createVariantAlleles(Path filePath) throws IOEx
VCFCodec codec = new VCFCodec();
codec.setVCFHeader(header, header.getVCFHeaderVersion() == null ? VCFHeaderVersion.VCF4_1 : header.getVCFHeaderVersion());

BufferedReader reader;
if (filePath.toFile().getName().endsWith(".gz"))
reader = new BufferedReader(new InputStreamReader(new GzipCompressorInputStream(new FileInputStream(filePath.toFile()))));
else
reader = Files.newBufferedReader(filePath);
BufferedReader reader = openFileForReading(filePath);

return reader.lines()
.onClose(() -> {try {reader.close();} catch (IOException ignored) {}})
.onClose(closeReader(reader))
.map(toVariantContext(codec))
.flatMap(Optional::stream)
.map(toVariants())
.flatMap(Optional::stream);
}

/**
 * Open the file at {@code filePath} for reading as UTF-8 encoded text.
 * <p>
 * A file whose name ends with {@code .gz} is decompressed on the fly using the JRE's
 * {@link GZIPInputStream}. Any other file is read as-is.
 *
 * @param filePath path to a plain or gzipped text file
 * @return a buffered reader over the (decompressed) content; the caller must close it
 * @throws IOException if the file cannot be opened or the gzip header cannot be read
 */
private static BufferedReader openFileForReading(Path filePath) throws IOException {
    if (!filePath.toFile().getName().endsWith(".gz")) {
        return Files.newBufferedReader(filePath, StandardCharsets.UTF_8);
    }

    java.io.InputStream raw = Files.newInputStream(filePath);
    try {
        // GZIPInputStream reads the gzip header in its constructor and throws if the header
        // is missing or malformed. Without the catch below the already opened file handle
        // would leak in that case.
        return new BufferedReader(
                new InputStreamReader(new GZIPInputStream(raw), StandardCharsets.UTF_8));
    } catch (IOException | RuntimeException e) {
        try {
            raw.close();
        } catch (IOException closeError) {
            // Keep the original failure as the primary error.
            e.addSuppressed(closeError);
        }
        throw e;
    }
}

/**
 * Create a close handler that closes the given {@code reader}.
 * <p>
 * The handler never throws an {@link IOException}: a failure to close is logged as a warning.
 *
 * @param reader the reader to close when the handler runs
 * @return a {@link Runnable} that closes the reader
 */
private static Runnable closeReader(BufferedReader reader) {
    return () -> closeAndLog(reader);
}

/**
 * Close the {@code reader} and log, rather than propagate, an {@link IOException}.
 */
private static void closeAndLog(BufferedReader reader) {
    try {
        LOGGER.trace("Closing VCF file");
        reader.close();
    } catch (IOException closeError) {
        LOGGER.warn("Error while closing the VCF file", closeError);
    }
}

/**
* One variant context might represent multiple sequence variants or a single symbolic variant/breakend.
* This function melts the variant context to a collection of variants.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.monarchinitiative.svanna.io.service;

import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.monarchinitiative.svanna.core.service.GeneService;
import org.monarchinitiative.svanna.core.service.QueryResult;
import org.monarchinitiative.svanna.io.service.jannovar.IntervalArray;
Expand All @@ -23,6 +22,7 @@
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;

public class SilentGenesGeneService implements GeneService {

Expand Down Expand Up @@ -64,7 +64,7 @@ public static SilentGenesGeneService of(GenomicAssembly assembly, Path silentGen
private static InputStream openForReading(Path silentGenesJsonPath) throws IOException {
if (silentGenesJsonPath.toFile().getName().endsWith(".gz")) {
LOGGER.debug("Assuming the file is gzipped");
return new BufferedInputStream(new GzipCompressorInputStream(Files.newInputStream(silentGenesJsonPath)));
return new BufferedInputStream(new GZIPInputStream(Files.newInputStream(silentGenesJsonPath)));
} else {
return new BufferedInputStream(Files.newInputStream(silentGenesJsonPath));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,12 @@
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeaderVersion;
import org.junit.jupiter.api.*;
import org.monarchinitiative.svanna.core.reference.SvannaVariant;
import org.monarchinitiative.svanna.core.reference.VariantAware;
import org.monarchinitiative.svanna.core.reference.Zygosity;
import org.monarchinitiative.svanna.io.FullSvannaVariant;
import org.monarchinitiative.svanna.io.TestDataConfig;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;
import org.monarchinitiative.svart.*;
import org.monarchinitiative.svart.assembly.GenomicAssembly;
import org.monarchinitiative.svart.assembly.GenomicAssemblies;
Expand All @@ -31,7 +28,8 @@
@SpringBootTest(classes = TestDataConfig.class)
public class VcfVariantParserTest {

private static final Path SV_EXAMPLE_PATH = Paths.get("src/test/resources/org/monarchinitiative/svanna/io/parse/sv_example.vcf");
private static final Path TEST_VCF_DIR = Paths.get("src/test/resources/org/monarchinitiative/svanna/io/parse");
private static final Path SV_EXAMPLE_PATH = TEST_VCF_DIR.resolve("sv_example.vcf");
private static final VCFCodec VCF_CODEC = new VCFCodec();

@BeforeAll
Expand Down Expand Up @@ -426,6 +424,41 @@ public void toVariants_breakendVariant() {
}
}

/**
 * Regression tests for issue
 * <a href="https://github.com/TheJacksonLaboratory/SvAnna/issues/235">235</a>.
 * <p>
 * Per the issue, HTSlib &gt;1.17 produces a gzipped file that cannot be read by commons-compress's
 * {@code GzipCompressorInputStream}. As a fix, that class was replaced by the JRE's
 * {@link java.util.zip.GZIPInputStream}.
 * <p>
 * The tests check that both the older and the newer gzipped VCF can be read by SvAnna's code.
 */
@Nested
public class GzipQuirkTests {

    private VcfVariantParser instance;

    @BeforeEach
    public void setUp() {
        // A fresh parser per test.
        instance = new VcfVariantParser(GenomicAssemblies.GRCh38p13());
    }

    @Test
    public void loadHtslibLeq16() throws Exception {
        assertThat(loadAlleles("htslib_16.vcf.gz"), hasSize(8));
    }

    @Test
    public void loadHtslibGeq17() throws Exception {
        assertThat(loadAlleles("htslib_17.vcf.gz"), hasSize(8));
    }

    /**
     * Parse the VCF file with the given name from the test VCF directory.
     */
    private List<FullSvannaVariant> loadAlleles(String vcfName) throws Exception {
        Path vcfPath = TEST_VCF_DIR.resolve(vcfName);
        return instance.createVariantAlleleList(vcfPath);
    }
}

private static GenomicAssembly testAssembly(List<Contig> contigs) {
return GenomicAssembly.of("toy", "Wookie", "9999", "Han Solo", "2100-01-01",
"GB1", "RS1", contigs);
Expand Down
Binary file not shown.
Binary file not shown.
Loading