From cd15f1e3abb61e18906b71b69bac45c80b8ed9ec Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 27 Nov 2024 17:21:05 +0100 Subject: [PATCH] add support of mibig v4.0 (#286) [MIBiG v4.0](https://mibig.secondarymetabolites.org/download) is released recently. This PR adds support for it. --- .../genomics/mibig/mibig_downloader.py | 7 +- src/nplinker/genomics/mibig/mibig_metadata.py | 33 +- tests/unit/data/mibig/BGC0000001_v4.0.json | 603 ++++++++++++++++++ tests/unit/genomics/test_mibig_downloader.py | 27 +- tests/unit/genomics/test_mibig_metadata.py | 30 +- 5 files changed, 654 insertions(+), 46 deletions(-) create mode 100644 tests/unit/data/mibig/BGC0000001_v4.0.json diff --git a/src/nplinker/genomics/mibig/mibig_downloader.py b/src/nplinker/genomics/mibig/mibig_downloader.py index 9b7d1d530..f063a7cb7 100644 --- a/src/nplinker/genomics/mibig/mibig_downloader.py +++ b/src/nplinker/genomics/mibig/mibig_downloader.py @@ -21,6 +21,7 @@ "2.0": "843ce4677db6d11422f0e6d94dd03e81", "3.0": "7c38b90f939086c03392d99a913baef9", "3.1": "643d1349722a9437d8dcf558dac5f815", + "4.0": "70d1e7d573652ba62548b1fcfbdbf844", } @@ -31,6 +32,8 @@ def download_and_extract_mibig_metadata( ): """Download and extract MIBiG metadata json files. + The MIBiG metadata json files are available at https://mibig.secondarymetabolites.org/download. + Note that it does not matter whether the metadata json files are in nested folders or not in the archive, all json files will be extracted to the same location, i.e. `extract_path`. The nested folders will be removed if they exist. So the `extract_path` will have only json files. @@ -39,7 +42,7 @@ def download_and_extract_mibig_metadata( download_root: Path to the directory in which to place the downloaded archive. extract_path: Path to an empty directory where the json files will be extracted. The directory must be empty if it exists. If it doesn't exist, the directory will be created. - version: _description_. Defaults to "3.1". + version: MIBiG version. Defaults to "3.1". Examples: >>> download_and_extract_mibig_metadata("/data/download", "/data/mibig_metadata") @@ -58,7 +61,7 @@ def download_and_extract_mibig_metadata( raise ValueError(f'Nonempty directory: "{extract_path}"') # download and extract - md5 = _MD5_MIBIG_METADATA[version] + md5 = _MD5_MIBIG_METADATA.get(version, None) download_and_extract_archive( url=MIBIG_METADATA_URL.format(version=version), download_root=download_root, diff --git a/src/nplinker/genomics/mibig/mibig_metadata.py b/src/nplinker/genomics/mibig/mibig_metadata.py index 84c2ae336..a8e06fa16 100644 --- a/src/nplinker/genomics/mibig/mibig_metadata.py +++ b/src/nplinker/genomics/mibig/mibig_metadata.py @@ -9,6 +9,8 @@ class MibigMetadata: MIBiG is a specification of BGC metadata and use JSON schema to represent BGC metadata. More details see: https://mibig.secondarymetabolites.org/download. + + This class supports MIBiG version 1.0 to 4.0. """ def __init__(self, file: str | PathLike) -> None: @@ -37,22 +39,37 @@ def mibig_accession(self) -> str: def biosyn_class(self) -> tuple[str]: """Get the value of metadata item 'biosyn_class'. - The 'biosyn_class' is biosynthetic class(es), namely the type of - natural product or secondary metabolite. + The 'biosyn_class' is biosynthetic class(es) defined by MIBiG. - MIBiG defines 6 major biosynthetic classes for natural products, + Before version 4.0 of MIBiG, it defines 6 major biosynthetic classes, including `NRP`, `Polyketide`, `RiPP`, `Terpene`, `Saccharide` - and `Alkaloid`. Note that natural products created by the other - biosynthetic mechanisms fall under the category `Other`. For more details - see [the paper](https://doi.org/10.1186/s40793-018-0318-y). + and `Alkaloid`. + + Starting from version 4.0, MIBiG defines 5 major biosynthetic classes, + including `PKS`, `NRPS`, `Ribosomal`, `Terpene` and `Saccharide`. + + The mapping between the old and new classes is as follows: + + - `NRP` -> `NRPS` + - `Polyketide` -> `PKS` + - `RiPP` -> `Ribosomal` + - `Terpene` -> `Terpene` + - `Saccharide` -> `Saccharide` + - `Alkaloid` -> `Other` + + Note that natural products that do not fit into any of the above + biosynthetic classes fall under the category `Other`. """ return self._biosyn_class def _parse_metadata(self) -> None: """Parse metadata to get 'mibig_accession' and 'biosyn_class' values.""" - if "general_params" in self.metadata: + if "general_params" in self.metadata: # version ≤1.4 self._mibig_accession = self.metadata["general_params"]["mibig_accession"] self._biosyn_class = tuple(self.metadata["general_params"]["biosyn_class"]) - else: # version≥2.0 + elif "cluster" in self.metadata: # version ≥2.0 and <4.0 self._mibig_accession = self.metadata["cluster"]["mibig_accession"] self._biosyn_class = tuple(self.metadata["cluster"]["biosyn_class"]) + elif "version" in self.metadata: # version≥4.0 + self._mibig_accession = self.metadata["accession"] + self._biosyn_class = tuple(i["class"] for i in self.metadata["biosynthesis"]["classes"]) diff --git a/tests/unit/data/mibig/BGC0000001_v4.0.json b/tests/unit/data/mibig/BGC0000001_v4.0.json new file mode 100644 index 000000000..958f613a8 --- /dev/null +++ b/tests/unit/data/mibig/BGC0000001_v4.0.json @@ -0,0 +1,603 @@ +{ + "accession": "BGC0000001", + "version": 5, + "changelog": { + "releases": [ + { + "version": "1", + "entries": [ + { + "contributors": [ + "M2JBIQNJAZIP5YVYS7CZLALR" + ], + "reviewers": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "date": "2015-06-12", + "comment": "Submitted" + } + ], + "date": "2015-06-12" + }, + { + "version": "2", + "entries": [ + { + "contributors": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "reviewers": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "date": "2019-10-16", + "comment": "Migrated from v1.4" + }, + { + "contributors": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "reviewers": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "date": "2019-10-16", + "comment": "Updated compound(s) information (MIBiG Annotathon)" + }, + { + "contributors": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "reviewers": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "date": "2019-10-16", + "comment": "Updated compound(s) information (NPAtlas curation)" + } + ], + "date": "2019-10-16" + }, + { + "version": "3", + "entries": [ + { + "contributors": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "reviewers": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "date": "2022-09-15", + "comment": "Removed ketoreductase stereochemistry annotation from modules without ketoreductases" + }, + { + "contributors": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "reviewers": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "date": "2022-09-15", + "comment": "Sorted modules by module number" + } + ], + "date": "2022-09-15" + }, + { + "version": "4", + "entries": [ + { + "contributors": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "reviewers": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "date": "2022-10-07", + "comment": "Update chemical activity to schema version 2.11" + } + ], + "date": "2022-10-07" + }, + { + "version": "5", + "entries": [ + { + "contributors": [ + "IQDGAEIXNOAWG3AVLVJH6HBZ" + ], + "reviewers": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "date": "2024-04-23", + "comment": "MIBiG v4 annotathon" + } + ], + "date": "2024-11-15" + } + ] + }, + "quality": "questionable", + "status": "active", + "completeness": "complete", + "loci": [ + { + "accession": "JF752342.1", + "location": { + "from": 0, + "to": 0 + }, + "evidence": [ + { + "method": "Knock-out studies" + } + ] + } + ], + "biosynthesis": { + "classes": [ + { + "class": "PKS", + "subclass": "Type I", + "cyclases": [] + } + ], + "modules": [ + { + "type": "pks-modular", + "name": "1", + "genes": [ + "AEK75502.1" + ], + "active": true, + "carriers": [ + { + "type": "carrier", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + }, + "subtype": "ACP", + "beta_branching": false + } + ], + "at_domain": { + "type": "acyltransferase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + }, + "substrates": [], + "evidence": [] + }, + "ks_domain": { + "type": "ketosynthase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + } + }, + { + "type": "pks-modular", + "name": "2", + "genes": [ + "AEK75502.1" + ], + "active": true, + "carriers": [ + { + "type": "carrier", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + }, + "subtype": "ACP", + "beta_branching": false + } + ], + "modification_domains": [ + { + "type": "ketoreductase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + }, + { + "type": "dehydratase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + } + ], + "at_domain": { + "type": "acyltransferase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + }, + "substrates": [], + "evidence": [] + }, + "ks_domain": { + "type": "ketosynthase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + } + }, + { + "type": "pks-modular", + "name": "3", + "genes": [ + "AEK75502.1" + ], + "active": true, + "carriers": [ + { + "type": "carrier", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + }, + "subtype": "ACP", + "beta_branching": false + } + ], + "modification_domains": [ + { + "type": "ketoreductase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + }, + { + "type": "dehydratase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + } + ], + "at_domain": { + "type": "acyltransferase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + }, + "substrates": [], + "evidence": [] + }, + "ks_domain": { + "type": "ketosynthase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + } + }, + { + "type": "pks-modular", + "name": "4", + "genes": [ + "AEK75502.1" + ], + "active": true, + "carriers": [ + { + "type": "carrier", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + }, + "subtype": "ACP", + "beta_branching": false + } + ], + "modification_domains": [ + { + "type": "ketoreductase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + }, + { + "type": "dehydratase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + } + ], + "at_domain": { + "type": "acyltransferase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + }, + "substrates": [], + "evidence": [] + }, + "ks_domain": { + "type": "ketosynthase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + } + }, + { + "type": "pks-modular", + "name": "5", + "genes": [ + "AEK75503.1" + ], + "active": true, + "carriers": [ + { + "type": "carrier", + "gene": "AEK75503.1", + "location": { + "from": -1, + "to": -1 + }, + "subtype": "ACP", + "beta_branching": false + } + ], + "at_domain": { + "type": "acyltransferase", + "gene": "AEK75503.1", + "location": { + "from": -1, + "to": -1 + }, + "substrates": [], + "evidence": [] + }, + "ks_domain": { + "type": "ketosynthase", + "gene": "AEK75503.1", + "location": { + "from": -1, + "to": -1 + } + } + }, + { + "type": "pks-modular", + "name": "6", + "genes": [ + "AEK75503.1" + ], + "active": true, + "carriers": [ + { + "type": "carrier", + "gene": "AEK75503.1", + "location": { + "from": -1, + "to": -1 + }, + "subtype": "ACP", + "beta_branching": false + } + ], + "modification_domains": [ + { + "type": "ketoreductase", + "gene": "AEK75503.1", + "location": { + "from": -1, + "to": -1 + } + }, + { + "type": "dehydratase", + "gene": "AEK75503.1", + "location": { + "from": -1, + "to": -1 + } + }, + { + "type": "enoylreductase", + "gene": "AEK75503.1", + "location": { + "from": -1, + "to": -1 + } + } + ], + "at_domain": { + "type": "acyltransferase", + "gene": "AEK75503.1", + "location": { + "from": -1, + "to": -1 + }, + "substrates": [], + "evidence": [] + }, + "ks_domain": { + "type": "ketosynthase", + "gene": "AEK75503.1", + "location": { + "from": -1, + "to": -1 + } + } + }, + { + "type": "pks-modular", + "name": "7", + "genes": [ + "AEK75504.1" + ], + "active": true, + "carriers": [ + { + "type": "carrier", + "gene": "AEK75504.1", + "location": { + "from": -1, + "to": -1 + }, + "subtype": "ACP", + "beta_branching": false + } + ], + "at_domain": { + "type": "acyltransferase", + "gene": "AEK75504.1", + "location": { + "from": -1, + "to": -1 + }, + "substrates": [], + "evidence": [] + }, + "ks_domain": { + "type": "ketosynthase", + "gene": "AEK75504.1", + "location": { + "from": -1, + "to": -1 + } + } + } + ] + }, + "compounds": [ + { + "name": "abyssomicin C", + "evidence": [], + "bioactivities": [ + { + "name": "antibacterial", + "observed": true, + "references": [] + }, + { + "name": "cytotoxic", + "observed": true, + "references": [] + } + ], + "structure": "CC1C[C@]23OC(=O)C4=C2OC1C(O)C3\\C=C/C(=O)[C@@H](C)C[C@@H](C)C4=O", + "databaseIds": [ + "npatlas:NPA01961", + "pubchem:71455791", + "chembl:CHEMBL1651097" + ], + "moieties": [ + "Tetronate" + ], + "mass": 346.141638424, + "formula": "C19H22O6" + }, + { + "name": "atrop-abyssomicin C", + "evidence": [], + "bioactivities": [ + { + "name": "antibacterial", + "observed": true, + "references": [] + }, + { + "name": "cytotoxic", + "observed": true, + "references": [] + } + ], + "structure": "CC1CC23OC(=O)C4=C2OC1C(O)C3\\C=C/C(=O)C(C)CC(C)C4=O", + "databaseIds": [ + "npatlas:NPA018239", + "chemspider:19955692" + ], + "moieties": [ + "Tetronate" + ], + "mass": 346.141638424, + "formula": "C19H22O6" + } + ], + "taxonomy": { + "name": "Verrucosispora maris AB-18-032", + "ncbiTaxId": 263358 + }, + "genes": { + "to_add": [ + { + "id": "abyU", + "location": { + "exons": [ + { + "from": 7760, + "to": 8182 + } + ], + "strand": -1 + }, + "translation": "MTERLETRPQALLIKVPTEIVVKVVDDVDVAAPAVGQVGKFDDELYDEAGAQIGTSSGNFRIEYVRPTDGGLLTYYQEDITLSDGVIHAEGWADFNDVRTSKWVFYPATGVSGRYLGLTGFRQWRMTGVRKSAEARILLGE" + } + ], + "annotations": [ + { + "id": "AEK75497.1", + "name": "abyA1", + "product": "3-oxoacyl-ACP synthase III" + }, + { + "id": "AEK75498.1", + "name": "abyA2", + "product": "phosphatase and glyceryl transferase" + }, + { + "id": "AEK75499.1", + "name": "abyA3", + "product": "acyl-carrier protein" + }, + { + "id": "AEK75500.1", + "name": "abyA4", + "product": "pyruvate/2-oxoglutarate dehydrogenase" + }, + { + "id": "abyU", + "name": "abyU", + "product": "Diels-Alderase" + }, + { + "id": "AEK75512.1", + "name": "abyV", + "product": "cytochrome P450" + } + ] + }, + "legacy_references": [ + "pubmed:21656887" + ] +} \ No newline at end of file diff --git a/tests/unit/genomics/test_mibig_downloader.py b/tests/unit/genomics/test_mibig_downloader.py index 8dfdb2646..a83e30e2e 100644 --- a/tests/unit/genomics/test_mibig_downloader.py +++ b/tests/unit/genomics/test_mibig_downloader.py @@ -3,26 +3,21 @@ class TestDownloadAndExtractMibigMetadata: - def test_default(self, tmp_path): + @pytest.mark.parametrize( + "version, expected", + [ + ["1.4", "mibig_json_1.4.tar.gz"], + ["3.1", "mibig_json_3.1.tar.gz"], + ["4.0", "mibig_json_4.0.tar.gz"], + ], + ) + def test_version(self, tmp_path, version, expected): download_path = tmp_path / "download" extract_path = tmp_path / "metadata" download_path.mkdir() extract_path.mkdir() - mibig.download_and_extract_mibig_metadata(download_path, extract_path) - archive = download_path / "mibig_json_3.1.tar.gz" - metadata = extract_path / "BGC0000002.json" - assert archive.exists() - assert archive.is_file() - assert metadata.exists() - assert metadata.is_file() - - def test_version(self, tmp_path): - download_path = tmp_path / "download" - extract_path = tmp_path / "metadata" - download_path.mkdir() - extract_path.mkdir() - mibig.download_and_extract_mibig_metadata(download_path, extract_path, version="1.4") - archive = download_path / "mibig_json_1.4.tar.gz" + mibig.download_and_extract_mibig_metadata(download_path, extract_path, version=version) + archive = download_path / expected metadata = extract_path / "BGC0000002.json" assert archive.exists() assert archive.is_file() diff --git a/tests/unit/genomics/test_mibig_metadata.py b/tests/unit/genomics/test_mibig_metadata.py index 738db8d31..f837078ac 100644 --- a/tests/unit/genomics/test_mibig_metadata.py +++ b/tests/unit/genomics/test_mibig_metadata.py @@ -3,23 +3,13 @@ from .. import DATA_DIR -@pytest.mark.parametrize("version", ["v1.4", "v3.1"]) -class TestMibigMetadata: - @pytest.fixture - def json_file(self, version): - json_file = DATA_DIR / "mibig" / f"BGC0000001_{version}.json" - yield str(json_file) - - @pytest.fixture - def metadata(self, json_file): - yield MibigMetadata(json_file) - - def test_init(self, metadata, json_file): - assert metadata.file == json_file - assert isinstance(metadata.metadata, dict) - - def test_mibig_accession(self, metadata): - assert metadata.mibig_accession == "BGC0000001" - - def test_biosyn_class(self, metadata): - assert metadata.biosyn_class == ("Polyketide",) +@pytest.mark.parametrize( + "version, expected", [["v1.4", "Polyketide"], ["v3.1", "Polyketide"], ["v4.0", "PKS"]] +) +def test_versions(version, expected): + json_file = DATA_DIR / "mibig" / f"BGC0000001_{version}.json" + metadata = MibigMetadata(str(json_file)) + assert metadata.file == str(json_file) + assert isinstance(metadata.metadata, dict) + assert metadata.mibig_accession == "BGC0000001" + assert metadata.biosyn_class == (expected,)