From 6a6f1705c5a8c34e57f11c35273e6306fc8e6c6e Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 14 Jun 2024 13:38:02 +0200 Subject: [PATCH] fix mypy and ruff errors (#257) * use overload for MetcalfScoring * add networkx stub file * fix mypy errors fix code or ignore type checking for some nonsense mypy errors * fix ruff check errors for refactored code * run ruff format * fix imports * uniform the use of TYPE_CHECKING to only avoid circular imports This ensures that the type hints are available both during type checking and at runtime, improving code clarity and reducing the chance of runtime errors related to type hints. * fix non-existing attribute bug * fix typos * use broader type hints Sequence and Mapping use broader type hints Sequence and Mapping to replace list and dict, respectively * change `datas` to `data` * use specific types for return of abstract method when possible use more general type when necessary --- README.dev.md | 2 +- pyproject.toml | 1 + src/nplinker/arranger.py | 4 +- src/nplinker/class_info/runcanopus.py | 4 -- src/nplinker/config.py | 2 +- src/nplinker/genomics/abc.py | 7 ++-- .../genomics/antismash/antismash_loader.py | 3 +- .../antismash/podp_antismash_downloader.py | 10 +++-- src/nplinker/genomics/bgc.py | 2 +- src/nplinker/genomics/gcf.py | 2 +- src/nplinker/genomics/mibig/mibig_loader.py | 8 ++-- src/nplinker/genomics/utils.py | 16 ++++--- src/nplinker/loader.py | 20 ++++++--- src/nplinker/metabolomics/abc.py | 42 +++++++++++++------ .../metabolomics/gnps/gnps_downloader.py | 8 +--- .../metabolomics/gnps/gnps_extractor.py | 2 +- src/nplinker/metabolomics/gnps/gnps_format.py | 2 +- src/nplinker/metabolomics/molecular_family.py | 4 +- src/nplinker/metabolomics/spectrum.py | 4 +- src/nplinker/metabolomics/utils.py | 18 ++++---- src/nplinker/nplinker.py | 10 ++--- src/nplinker/schemas/user_strains.json | 4 +- src/nplinker/schemas/utils.py | 2 +- src/nplinker/scoring/abc.py | 2 +- src/nplinker/scoring/iokr/mk_fprints.py | 4 -- src/nplinker/scoring/iokr/spectrum_filters.py | 1 + src/nplinker/scoring/link_graph.py | 6 +-- src/nplinker/scoring/metcalf_scoring.py | 24 +++++++---- src/nplinker/scoring/utils.py | 31 ++++++-------- src/nplinker/strain/utils.py | 2 +- src/nplinker/utils.py | 26 ++++++++++-- tests/unit/genomics/test_mibig_downloader.py | 2 +- tests/unit/genomics/test_mibig_loader.py | 14 +++---- .../test_genome_bgc_mappings_schema.py | 4 +- .../unit/schemas/test_genome_status_schema.py | 4 +- .../schemas/test_strain_mappings_schema.py | 4 +- .../unit/schemas/test_user_strains_schema.py | 11 +++-- tests/unit/scoring/conftest.py | 7 +--- tests/unit/test_config.py | 2 +- tests/unit/test_utils.py | 10 ++--- 40 files changed, 188 insertions(+), 143 deletions(-) diff --git a/README.dev.md b/README.dev.md index 27b9967a..3f03ceaf 100644 --- a/README.dev.md +++ b/README.dev.md @@ -33,7 +33,7 @@ python3 -m pip install --upgrade pip setuptools # install development dependencies pip install --no-cache-dir --editable ".[dev]" -# install non-pypi dependecies +# install non-pypi dependencies install-nplinker-deps ``` diff --git a/pyproject.toml b/pyproject.toml index 7326a021..dd118f41 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,7 @@ dev = [ "types-Deprecated", "types-beautifulsoup4", "types-jsonschema", + "types-networkx", "pandas-stubs", # docs "mkdocs", diff --git a/src/nplinker/arranger.py b/src/nplinker/arranger.py index e91cb726..06df9ac5 100644 --- a/src/nplinker/arranger.py +++ b/src/nplinker/arranger.py @@ -171,7 +171,7 @@ def _get_gnps_file_mappings_file(self) -> Path: file_mappings_tsv if file_mappings_tsv.exists() else file_mappings_csv ) - return gnps_file_mappings_file + return gnps_file_mappings_file # type: ignore def _download_and_extract_gnps(self) -> None: """Download and extract the GNPS data. @@ -304,7 +304,7 @@ def arrange_strain_mappings(self) -> None: If `self.config.mode` is "local", validate the strain mappings file. If `self.config.mode` is "podp", always generate the strain mappings file and validate it. - The valiation checks if the strain mappings file exists and if it is a valid JSON file + The validation checks if the strain mappings file exists and if it is a valid JSON file according to the schema defined in `schemas/strain_mappings_schema.json`. """ if self.config.mode == "podp": diff --git a/src/nplinker/class_info/runcanopus.py b/src/nplinker/class_info/runcanopus.py index 6108e7e5..17807278 100644 --- a/src/nplinker/class_info/runcanopus.py +++ b/src/nplinker/class_info/runcanopus.py @@ -81,7 +81,3 @@ def run_canopus(mgf_file, output_path, extra_params="--maxmz 600 formula zodiac open(os.path.join(output_path, "completed"), "w").close() return True - - -if __name__ == "__main__": - run_canopus(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4]) diff --git a/src/nplinker/config.py b/src/nplinker/config.py index 5e734ff5..24fba952 100644 --- a/src/nplinker/config.py +++ b/src/nplinker/config.py @@ -36,7 +36,7 @@ def load_config(config_file: str | PathLike) -> Dynaconf: # Note: -# Validataor parameter `required=False` means the setting (e.g. "loglevel") must not exist rather +# Validator parameter `required=False` means the setting (e.g. "loglevel") must not exist rather # than being optional. So don't set the parameter `required` if the key is optional. CONFIG_VALIDATORS = [ # General settings diff --git a/src/nplinker/genomics/abc.py b/src/nplinker/genomics/abc.py index 57c49ae4..daeb3bef 100644 --- a/src/nplinker/genomics/abc.py +++ b/src/nplinker/genomics/abc.py @@ -1,6 +1,5 @@ from abc import ABC from abc import abstractmethod -from collections.abc import Sequence from .bgc import BGC from .gcf import GCF @@ -8,7 +7,7 @@ class BGCLoaderBase(ABC): """Abstract base class for BGC loader.""" - def __init__(self, data_dir: str): + def __init__(self, data_dir: str) -> None: """Initialize the BGC loader. Args: @@ -26,7 +25,7 @@ def get_files(self) -> dict[str, str]: """ @abstractmethod - def get_bgcs(self) -> Sequence[BGC]: + def get_bgcs(self) -> list[BGC]: """Get BGC objects. Returns: @@ -38,7 +37,7 @@ class GCFLoaderBase(ABC): """Abstract base class for GCF loader.""" @abstractmethod - def get_gcfs(self, keep_mibig_only: bool, keep_singleton: bool) -> Sequence[GCF]: + def get_gcfs(self, keep_mibig_only: bool, keep_singleton: bool) -> list[GCF]: """Get GCF objects. Args: diff --git a/src/nplinker/genomics/antismash/antismash_loader.py b/src/nplinker/genomics/antismash/antismash_loader.py index 08384817..4c6bd991 100644 --- a/src/nplinker/genomics/antismash/antismash_loader.py +++ b/src/nplinker/genomics/antismash/antismash_loader.py @@ -2,6 +2,7 @@ import fnmatch import logging import os +from typing import Mapping from Bio import SeqIO from Bio import SeqRecord from nplinker.genomics import BGC @@ -97,7 +98,7 @@ def get_bgcs(self) -> list[BGC]: return self._bgcs @staticmethod - def _parse_bgcs(bgc_files: dict[str, str]) -> list[BGC]: + def _parse_bgcs(bgc_files: Mapping[str, str]) -> list[BGC]: """Load given BGC files as BGC objects. Args: diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 515fdffe..2a77cc20 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -5,6 +5,8 @@ import time from os import PathLike from pathlib import Path +from typing import Mapping +from typing import Sequence import httpx from bs4 import BeautifulSoup from bs4 import NavigableString @@ -82,7 +84,7 @@ def read_json(file: str | PathLike) -> dict[str, "GenomeStatus"]: @staticmethod def to_json( - genome_status_dict: dict[str, "GenomeStatus"], file: str | PathLike | None = None + genome_status_dict: Mapping[str, "GenomeStatus"], file: str | PathLike | None = None ) -> str | None: """Convert the genome status dictionary to a JSON string. @@ -122,7 +124,7 @@ def _to_dict(self) -> dict: def podp_download_and_extract_antismash_data( - genome_records: list[dict[str, dict[str, str]]], + genome_records: Sequence[Mapping[str, Mapping[str, str]]], project_download_root: str | PathLike, project_extract_root: str | PathLike, ): @@ -220,7 +222,7 @@ def podp_download_and_extract_antismash_data( raise ValueError("No antiSMASH data found for any genome") -def get_best_available_genome_id(genome_id_data: dict[str, str]) -> str | None: +def get_best_available_genome_id(genome_id_data: Mapping[str, str]) -> str | None: """Get the best available ID from genome_id_data dict. Args: @@ -359,7 +361,7 @@ def _resolve_jgi_accession(jgi_id: str) -> str: return _resolve_genbank_accession(link.text) -def _resolve_refseq_id(genome_id_data: dict[str, str]) -> str: +def _resolve_refseq_id(genome_id_data: Mapping[str, str]) -> str: """Get the RefSeq ID to which the genome accession is linked. Check https://pairedomicsdata.bioinformatics.nl/schema.json. diff --git a/src/nplinker/genomics/bgc.py b/src/nplinker/genomics/bgc.py index e291cac0..12e6660e 100644 --- a/src/nplinker/genomics/bgc.py +++ b/src/nplinker/genomics/bgc.py @@ -2,11 +2,11 @@ import logging from typing import TYPE_CHECKING from deprecated import deprecated +from nplinker.strain import Strain from .aa_pred import predict_aa if TYPE_CHECKING: - from ..strain import Strain from .gcf import GCF logger = logging.getLogger(__name__) diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index ecddd7a7..6a1e4f2f 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -1,11 +1,11 @@ from __future__ import annotations import logging from typing import TYPE_CHECKING +from nplinker.strain import Strain from nplinker.strain import StrainCollection if TYPE_CHECKING: - from nplinker.strain import Strain from .bgc import BGC logger = logging.getLogger(__name__) diff --git a/src/nplinker/genomics/mibig/mibig_loader.py b/src/nplinker/genomics/mibig/mibig_loader.py index 38fed6b3..3d8eab6b 100644 --- a/src/nplinker/genomics/mibig/mibig_loader.py +++ b/src/nplinker/genomics/mibig/mibig_loader.py @@ -21,14 +21,14 @@ class MibigLoader: """ def __init__(self, data_dir: str): - """Initialize the MIBiG metatdata loader. + """Initialize the MIBiG metadata loader. Args: data_dir: Path to the directory of MIBiG metadata json files """ self.data_dir = data_dir self._file_dict = self.parse_data_dir(self.data_dir) - self._metadata_dict = self._parse_metadatas() + self._metadata_dict = self._parse_metadata() self._bgcs = self._parse_bgcs() def get_files(self) -> dict[str, str]: @@ -58,7 +58,7 @@ def parse_data_dir(data_dir: str) -> dict[str, str]: file_dict[fname] = file return file_dict - def get_metadatas(self) -> dict[str, MibigMetadata]: + def get_metadata(self) -> dict[str, MibigMetadata]: """Get MibigMetadata objects. Returns: @@ -66,7 +66,7 @@ def get_metadatas(self) -> dict[str, MibigMetadata]: """ return self._metadata_dict - def _parse_metadatas(self) -> dict[str, MibigMetadata]: + def _parse_metadata(self) -> dict[str, MibigMetadata]: """Parse all metadata files and return MibigMetadata objects. Returns: diff --git a/src/nplinker/genomics/utils.py b/src/nplinker/genomics/utils.py index ba4c227b..41b65316 100644 --- a/src/nplinker/genomics/utils.py +++ b/src/nplinker/genomics/utils.py @@ -3,6 +3,8 @@ import logging from os import PathLike from pathlib import Path +from typing import Mapping +from typing import Sequence from jsonschema import validate from nplinker.defaults import GENOME_BGC_MAPPINGS_FILENAME from nplinker.schemas import GENOME_BGC_MAPPINGS_SCHEMA @@ -65,7 +67,9 @@ def generate_mappings_genome_id_bgc_id( logger.info("Generated genome-BGC mappings file: %s", output_file) -def add_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]) -> tuple[list[BGC], list[BGC]]: +def add_strain_to_bgc( + strains: StrainCollection, bgcs: Sequence[BGC] +) -> tuple[list[BGC], list[BGC]]: """Assign a Strain object to `BGC.strain` for input BGCs. BGC id is used to find the corresponding Strain object. It's possible that @@ -111,7 +115,7 @@ def add_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]) -> tuple[list[ def add_bgc_to_gcf( - bgcs: list[BGC], gcfs: list[GCF] + bgcs: Sequence[BGC], gcfs: Sequence[GCF] ) -> tuple[list[GCF], list[GCF], dict[GCF, set[str]]]: """Add BGC objects to GCF object based on GCF's BGC ids. @@ -165,7 +169,7 @@ def add_bgc_to_gcf( return gcf_with_bgc, gcf_without_bgc, gcf_missing_bgc -def get_mibig_from_gcf(gcfs: list[GCF]) -> tuple[list[BGC], StrainCollection]: +def get_mibig_from_gcf(gcfs: Sequence[GCF]) -> tuple[list[BGC], StrainCollection]: """Get MIBiG BGCs and strains from GCF objects. Args: @@ -277,9 +281,9 @@ def extract_mappings_resolved_genome_id_bgc_id( def get_mappings_strain_id_bgc_id( - mappings_strain_id_original_genome_id: dict[str, set[str]], - mappings_original_genome_id_resolved_genome_id: dict[str, str], - mappings_resolved_genome_id_bgc_id: dict[str, set[str]], + mappings_strain_id_original_genome_id: Mapping[str, set[str]], + mappings_original_genome_id_resolved_genome_id: Mapping[str, str], + mappings_resolved_genome_id_bgc_id: Mapping[str, set[str]], ) -> dict[str, set[str]]: """Get mappings "strain_id <-> bgc_id". diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index ce534f52..3f248bec 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -1,9 +1,12 @@ +from __future__ import annotations import logging import os from deprecated import deprecated from dynaconf import Dynaconf from nplinker import NPLINKER_APP_DATA_DIR from nplinker import defaults +from nplinker.genomics import BGC +from nplinker.genomics import GCF from nplinker.genomics.antismash import AntismashBGCLoader from nplinker.genomics.bigscape import BigscapeGCFLoader from nplinker.genomics.bigscape import BigscapeV2GCFLoader @@ -11,6 +14,8 @@ from nplinker.genomics.utils import add_bgc_to_gcf from nplinker.genomics.utils import add_strain_to_bgc from nplinker.genomics.utils import get_mibig_from_gcf +from nplinker.metabolomics import MolecularFamily +from nplinker.metabolomics import Spectrum from nplinker.metabolomics.gnps import GNPSAnnotationLoader from nplinker.metabolomics.gnps import GNPSMolecularFamilyLoader from nplinker.metabolomics.gnps import GNPSSpectrumLoader @@ -58,11 +63,14 @@ def __init__(self, config: Dynaconf): """ self.config = config - self.bgcs, self.gcfs, self.spectra, self.mfs = [], [], [], [] - self.mibig_bgcs = [] - self.mibig_strains_in_use = StrainCollection() - self.product_types = [] - self.strains = StrainCollection() + self.bgcs: list[BGC] = [] + self.gcfs: list[GCF] = [] + self.spectra: list[Spectrum] = [] + self.mfs: list[MolecularFamily] = [] + self.mibig_bgcs: list[BGC] = [] + self.mibig_strains_in_use: StrainCollection = StrainCollection() + self.product_types: list = [] + self.strains: StrainCollection = StrainCollection() self.class_matches = None self.chem_classes = None @@ -93,7 +101,7 @@ def _load_strain_mappings(self): self.strains.add(strain) logger.info("Loaded {} non-MiBIG Strain objects".format(len(self.strains))) - # 2. filter user specificied strains (remove all that are not specified by user). + # 2. filter user specified strains (remove all that are not specified by user). # It's not allowed to specify empty list of strains, otherwise validation will fail. user_strains_file = self.config.root_dir / defaults.STRAINS_SELECTED_FILENAME if user_strains_file.exists(): diff --git a/src/nplinker/metabolomics/abc.py b/src/nplinker/metabolomics/abc.py index 5cadb0e4..6af3052e 100644 --- a/src/nplinker/metabolomics/abc.py +++ b/src/nplinker/metabolomics/abc.py @@ -1,23 +1,27 @@ from abc import ABC from abc import abstractmethod -from collections.abc import Sequence -from typing import TYPE_CHECKING - - -if TYPE_CHECKING: - from .molecular_family import MolecularFamily - from .spectrum import Spectrum +from .molecular_family import MolecularFamily +from .spectrum import Spectrum class SpectrumLoaderBase(ABC): + """Abstract base class for SpectrumLoader.""" + @property @abstractmethod - def spectra(self) -> Sequence["Spectrum"]: ... + def spectra(self) -> list["Spectrum"]: + """Get Spectrum objects. + + Returns: + A sequence of Spectrum objects. + """ class MolecularFamilyLoaderBase(ABC): + """Abstract base class for MolecularFamilyLoader.""" + @abstractmethod - def get_mfs(self, keep_singleton: bool) -> Sequence["MolecularFamily"]: + def get_mfs(self, keep_singleton: bool) -> list["MolecularFamily"]: """Get MolecularFamily objects. Args: @@ -26,17 +30,31 @@ def get_mfs(self, keep_singleton: bool) -> Sequence["MolecularFamily"]: only one spectrum. Returns: - A list of MolecularFamily objects. + A sequence of MolecularFamily objects. """ class FileMappingLoaderBase(ABC): + """Abstract base class for FileMappingLoader.""" + @property @abstractmethod - def mappings(self) -> dict[str, list[str]]: ... + def mappings(self) -> dict[str, list[str]]: + """Get file mappings. + + Returns: + A mapping from spectrum ID to the names of files where the spectrum occurs. + """ class AnnotationLoaderBase(ABC): + """Abstract base class for AnnotationLoader.""" + @property @abstractmethod - def annotations(self) -> dict[str, dict]: ... + def annotations(self) -> dict[str, dict]: + """Get annotations. + + Returns: + A mapping from spectrum ID to its annotations. + """ diff --git a/src/nplinker/metabolomics/gnps/gnps_downloader.py b/src/nplinker/metabolomics/gnps/gnps_downloader.py index b0febe07..365d60fb 100644 --- a/src/nplinker/metabolomics/gnps/gnps_downloader.py +++ b/src/nplinker/metabolomics/gnps/gnps_downloader.py @@ -1,16 +1,12 @@ from __future__ import annotations from os import PathLike from pathlib import Path -from typing import TYPE_CHECKING +from typing_extensions import Self from nplinker.utils import download_url from .gnps_format import GNPSFormat from .gnps_format import gnps_format_from_task_id -if TYPE_CHECKING: - from typing_extensions import Self - - class GNPSDownloader: """Download GNPS zip archive for the given task id. @@ -92,7 +88,7 @@ def get_task_id(self) -> str: return self._task_id def get_url(self) -> str: - """Get the full URL linking to GNPS data to be dowloaded. + """Get the full URL linking to GNPS data to be downloaded. Returns: URL pointing to the GNPS data to be downloaded. diff --git a/src/nplinker/metabolomics/gnps/gnps_extractor.py b/src/nplinker/metabolomics/gnps/gnps_extractor.py index 7d71f089..f393e830 100644 --- a/src/nplinker/metabolomics/gnps/gnps_extractor.py +++ b/src/nplinker/metabolomics/gnps/gnps_extractor.py @@ -19,7 +19,7 @@ class GNPSExtractor: - annotations.tsv The files to be extracted are selected based on the GNPS workflow type, - as desribed below (in the order of the files above): + as described below (in the order of the files above): 1. METABOLOMICS-SNETS - clusterinfosummarygroup_attributes_withIDs_withcomponentID/*.tsv diff --git a/src/nplinker/metabolomics/gnps/gnps_format.py b/src/nplinker/metabolomics/gnps/gnps_format.py index 5014682d..96fa083e 100644 --- a/src/nplinker/metabolomics/gnps/gnps_format.py +++ b/src/nplinker/metabolomics/gnps/gnps_format.py @@ -105,7 +105,7 @@ def gnps_format_from_archive(zip_file: str | PathLike) -> GNPSFormat: def gnps_format_from_file_mapping(file: str | PathLike) -> GNPSFormat: """Detect GNPS format from the given file mapping file. - The GNSP file mapping file is located in different folders depending on the + The GNPS file mapping file is located in different folders depending on the GNPS workflow. Here are the locations in corresponding GNPS zip archives: - METABOLOMICS-SNETS workflow: the .tsv file under folder "clusterinfosummarygroup_attributes_withIDs_withcomponentID" diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index 16d9bd6a..7988aa0c 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -1,7 +1,7 @@ from __future__ import annotations from typing import TYPE_CHECKING -from ..strain.strain import Strain -from ..strain.strain_collection import StrainCollection +from nplinker.strain import Strain +from nplinker.strain import StrainCollection if TYPE_CHECKING: diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 5ec7ccb2..841fe2a8 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -19,7 +19,7 @@ class Spectrum: intensity: the list of intensity values. precursor_mz: the m/z value of the precursor. rt: the retention time in seconds. - metadata: the metadata of the spectrum, i.e. the header infomation in the MGF + metadata: the metadata of the spectrum, i.e. the header information in the MGF file. gnps_annotations: the GNPS annotations of the spectrum. gnps_id: the GNPS ID of the spectrum. @@ -45,7 +45,7 @@ def __init__( intensity: the list of intensity values. precursor_mz: the precursor m/z. rt: the retention time in seconds. Defaults to 0. - metadata: the metadata of the spectrum, i.e. the header infomation + metadata: the metadata of the spectrum, i.e. the header information in the MGF file. """ self.id = id diff --git a/src/nplinker/metabolomics/utils.py b/src/nplinker/metabolomics/utils.py index a8a53aef..1110fb7d 100644 --- a/src/nplinker/metabolomics/utils.py +++ b/src/nplinker/metabolomics/utils.py @@ -3,6 +3,8 @@ import logging from os import PathLike from pathlib import Path +from typing import Mapping +from typing import Sequence from nplinker.schemas import validate_podp_json from nplinker.strain import StrainCollection from .gnps.gnps_file_mapping_loader import GNPSFileMappingLoader @@ -13,8 +15,10 @@ logger = logging.getLogger(__name__) -def add_annotation_to_spectrum(annotations: dict[str, dict], spectra: list[Spectrum]) -> None: - """Add GNPS annotations to the `Spectrum.gnps_annotaions` attribute for input spectra. +def add_annotation_to_spectrum( + annotations: Mapping[str, dict], spectra: Sequence[Spectrum] +) -> None: + """Add GNPS annotations to the `Spectrum.gnps_annotations` attribute for input spectra. It is possible that some spectra don't have annotations. Note that the input `spectra` list is changed in place. @@ -30,7 +34,7 @@ def add_annotation_to_spectrum(annotations: dict[str, dict], spectra: list[Spect def add_strains_to_spectrum( - strains: StrainCollection, spectra: list[Spectrum] + strains: StrainCollection, spectra: Sequence[Spectrum] ) -> tuple[list[Spectrum], list[Spectrum]]: """Add `Strain` objects to the `Spectrum.strains` attribute for input spectra. @@ -45,7 +49,7 @@ def add_strains_to_spectrum( - the first list contains Spectrum objects that are updated with Strain objects; - the second list contains Spectrum objects that are not updated with Strain objects - becuase no Strain objects are found. + because no Strain objects are found. """ spectra_with_strains = [] spectra_without_strains = [] @@ -69,7 +73,7 @@ def add_strains_to_spectrum( def add_spectrum_to_mf( - spectra: list[Spectrum], mfs: list[MolecularFamily] + spectra: Sequence[Spectrum], mfs: Sequence[MolecularFamily] ) -> tuple[list[MolecularFamily], list[MolecularFamily], dict[MolecularFamily, set[str]]]: """Add Spectrum objects to MolecularFamily objects. @@ -186,8 +190,8 @@ def extract_mappings_ms_filename_spectrum_id( def get_mappings_strain_id_spectrum_id( - mappings_strain_id_ms_filename: dict[str, set[str]], - mappings_ms_filename_spectrum_id: dict[str, set[str]], + mappings_strain_id_ms_filename: Mapping[str, set[str]], + mappings_ms_filename_spectrum_id: Mapping[str, set[str]], ) -> dict[str, set[str]]: """Get mappings "strain_id <-> spectrum_id". diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 60e20be7..33e07bb7 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -1,7 +1,8 @@ from __future__ import annotations import logging +from os import PathLike from pprint import pformat -from typing import TYPE_CHECKING +from typing import Sequence from typing import TypeVar from typing import overload from . import setup_logging @@ -13,16 +14,11 @@ from .loader import DatasetLoader from .metabolomics import MolecularFamily from .metabolomics import Spectrum +from .scoring.link_graph import LinkGraph from .scoring.metcalf_scoring import MetcalfScoring from .strain import StrainCollection -if TYPE_CHECKING: - from os import PathLike - from typing import Sequence - from nplinker.scoring.link_graph import LinkGraph - - logger = logging.getLogger(__name__) ObjectType = TypeVar("ObjectType", BGC, GCF, Spectrum, MolecularFamily) diff --git a/src/nplinker/schemas/user_strains.json b/src/nplinker/schemas/user_strains.json index 64949566..35b0e7af 100644 --- a/src/nplinker/schemas/user_strains.json +++ b/src/nplinker/schemas/user_strains.json @@ -1,7 +1,7 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", "$id": "https://raw.githubusercontent.com/NPLinker/nplinker/main/src/nplinker/schemas/user_strains.json", - "title": "User specificed strains", + "title": "User specified strains", "description": "A list of strain IDs specified by user", "type": "object", "required": [ @@ -11,7 +11,7 @@ "strain_ids": { "type": "array", "title": "Strain IDs", - "description": "A list of strain IDs specificed by user. The strain IDs must be the same as the ones in the strain mappings file.", + "description": "A list of strain IDs specified by user. The strain IDs must be the same as the ones in the strain mappings file.", "items": { "type": "string", "minLength": 1 diff --git a/src/nplinker/schemas/utils.py b/src/nplinker/schemas/utils.py index ce47d946..c0a9bd23 100644 --- a/src/nplinker/schemas/utils.py +++ b/src/nplinker/schemas/utils.py @@ -13,7 +13,7 @@ def validate_podp_json(json_data: dict) -> None: All validation error messages are collected and raised as a single ValueError. - Parameters: + Args: json_data: The JSON data to validate. Raises: diff --git a/src/nplinker/scoring/abc.py b/src/nplinker/scoring/abc.py index fa287190..cad3a88e 100644 --- a/src/nplinker/scoring/abc.py +++ b/src/nplinker/scoring/abc.py @@ -3,11 +3,11 @@ from abc import ABC from abc import abstractmethod from typing import TYPE_CHECKING +from .link_graph import LinkGraph if TYPE_CHECKING: from nplinker.nplinker import NPLinker - from .link_graph import LinkGraph logger = logging.getLogger(__name__) diff --git a/src/nplinker/scoring/iokr/mk_fprints.py b/src/nplinker/scoring/iokr/mk_fprints.py index 13874e19..a0abf7a4 100644 --- a/src/nplinker/scoring/iokr/mk_fprints.py +++ b/src/nplinker/scoring/iokr/mk_fprints.py @@ -101,7 +101,3 @@ def fingerprint_from_inchi(inchi, fingerprint_type=None): for fp_bit in range(fp_size): fp_array[fp_bit] = fp.get(fp_bit) return fp_array - - -if __name__ == "__main__": - main() diff --git a/src/nplinker/scoring/iokr/spectrum_filters.py b/src/nplinker/scoring/iokr/spectrum_filters.py index 195feac0..eeb4da50 100644 --- a/src/nplinker/scoring/iokr/spectrum_filters.py +++ b/src/nplinker/scoring/iokr/spectrum_filters.py @@ -16,6 +16,7 @@ import os import pickle import numpy + # import sys # sys.path.append('/home/grimur/git/lda') # from lda.code.formula import Formula diff --git a/src/nplinker/scoring/link_graph.py b/src/nplinker/scoring/link_graph.py index 96596dc4..ce1a06b6 100644 --- a/src/nplinker/scoring/link_graph.py +++ b/src/nplinker/scoring/link_graph.py @@ -83,7 +83,7 @@ class LinkGraph: """ def __init__(self) -> None: - self._g = Graph() + self._g: Graph = Graph() def __str__(self) -> str: """Get a short summary of the LinkGraph.""" @@ -113,7 +113,7 @@ def __getitem__( except KeyError: raise KeyError(f"{u} not found in the link graph.") - return {**links} + return {**links} # type: ignore @property def links( @@ -191,4 +191,4 @@ def get_link_data( A dictionary of scoring methods and their data for the link between the two objects, or None if there is no link between the two objects. """ - return self._g.get_edge_data(u, v) + return self._g.get_edge_data(u, v) # type: ignore diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index 9c1df113..9985b710 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -3,6 +3,7 @@ from enum import Enum from typing import TYPE_CHECKING from typing import TypeVar +from typing import overload import numpy as np import pandas as pd from scipy.stats import hypergeom @@ -19,7 +20,7 @@ if TYPE_CHECKING: - from ..nplinker import NPLinker + from nplinker.nplinker import NPLinker logger = logging.getLogger(__name__) @@ -117,14 +118,14 @@ def setup(cls, npl: NPLinker): cls.presence_spec_strain, cls.presence_gcf_strain, cls.metcalf_weights ) cls.raw_score_spec_gcf = raw_score_spec_gcf.reset_index().melt(id_vars="index") - cls.raw_score_spec_gcf.columns = ["spec", "gcf", "score"] + cls.raw_score_spec_gcf.columns = ["spec", "gcf", "score"] # type: ignore # calculate raw Metcalf scores for spec-gcf links raw_score_mf_gcf = cls._calc_raw_score( cls.presence_mf_strain, cls.presence_gcf_strain, cls.metcalf_weights ) cls.raw_score_mf_gcf = raw_score_mf_gcf.reset_index().melt(id_vars="index") - cls.raw_score_mf_gcf.columns = ["mf", "gcf", "score"] + cls.raw_score_mf_gcf.columns = ["mf", "gcf", "score"] # type: ignore # calculate mean and std for standardising Metcalf scores cls.metcalf_mean, cls.metcalf_std = cls._calc_mean_std( @@ -133,7 +134,14 @@ def setup(cls, npl: NPLinker): logger.info("MetcalfScoring.setup completed") - def get_links(self, *objects: ObjectType, **parameters) -> LinkGraph: + @overload + def get_links(self, *objects: GCF, **parameters) -> LinkGraph: ... + @overload + def get_links(self, *objects: Spectrum, **parameters) -> LinkGraph: ... + @overload + def get_links(self, *objects: MolecularFamily, **parameters) -> LinkGraph: ... + + def get_links(self, *objects, **parameters): """Get links for the given objects. Args: @@ -348,11 +356,11 @@ def _calc_standardised_score(self, raw_scores: list[pd.DataFrame]) -> list[pd.Da for row in raw_score_df.itertuples(index=False): met = row.spec if raw_score_df.name == LinkType.SPEC_GCF else row.mf - n_gcf_strains = len(row.gcf.strains) - n_met_strains = len(met.strains) + n_gcf_strains = len(row.gcf.strains) # type: ignore + n_met_strains = len(met.strains) # type: ignore - mean = self.metcalf_mean[n_met_strains][n_gcf_strains] - sqrt = self.metcalf_std[n_met_strains][n_gcf_strains] + mean = self.metcalf_mean[n_met_strains][n_gcf_strains] # type: ignore + sqrt = self.metcalf_std[n_met_strains][n_gcf_strains] # type: ignore z_score = (row.score - mean) / sqrt diff --git a/src/nplinker/scoring/utils.py b/src/nplinker/scoring/utils.py index 418fc1ec..35b2fbff 100644 --- a/src/nplinker/scoring/utils.py +++ b/src/nplinker/scoring/utils.py @@ -1,15 +1,10 @@ from __future__ import annotations -from typing import TYPE_CHECKING from typing import Sequence -import numpy as np import pandas as pd - - -if TYPE_CHECKING: - from nplinker.genomics import GCF - from nplinker.metabolomics import MolecularFamily - from nplinker.metabolomics import Spectrum - from nplinker.strain import StrainCollection +from nplinker.genomics import GCF +from nplinker.metabolomics import MolecularFamily +from nplinker.metabolomics import Spectrum +from nplinker.strain import StrainCollection def get_presence_gcf_strain(gcfs: Sequence[GCF], strains: StrainCollection) -> pd.DataFrame: @@ -19,16 +14,16 @@ def get_presence_gcf_strain(gcfs: Sequence[GCF], strains: StrainCollection) -> p values are 1 if the gcf occurs in the strain, 0 otherwise. """ df_gcf_strain = pd.DataFrame( - np.zeros((len(gcfs), len(strains))), + 0, index=gcfs, columns=list(strains), dtype=int, - ) + ) # type: ignore for gcf in gcfs: for strain in strains: if gcf.has_strain(strain): df_gcf_strain.loc[gcf, strain] = 1 - return df_gcf_strain + return df_gcf_strain # type: ignore def get_presence_spec_strain( @@ -40,16 +35,16 @@ def get_presence_spec_strain( the values are 1 if the spectrum occurs in the strain, 0 otherwise. """ df_spec_strain = pd.DataFrame( - np.zeros((len(spectra), len(strains))), + 0, index=spectra, columns=list(strains), dtype=int, - ) + ) # type: ignore for spectrum in spectra: for strain in strains: if spectrum.has_strain(strain): df_spec_strain.loc[spectrum, strain] = 1 - return df_spec_strain + return df_spec_strain # type: ignore def get_presence_mf_strain( @@ -61,13 +56,13 @@ def get_presence_mf_strain( columns, and the values are 1 if the molecular family occurs in the strain, 0 otherwise. """ df_mf_strain = pd.DataFrame( - np.zeros((len(mfs), len(strains))), + 0, index=mfs, columns=list(strains), dtype=int, - ) + ) # type: ignore for mf in mfs: for strain in strains: if mf.has_strain(strain): df_mf_strain.loc[mf, strain] = 1 - return df_mf_strain + return df_mf_strain # type: ignore diff --git a/src/nplinker/strain/utils.py b/src/nplinker/strain/utils.py index 0443067e..25af274a 100644 --- a/src/nplinker/strain/utils.py +++ b/src/nplinker/strain/utils.py @@ -97,7 +97,7 @@ def podp_generate_strain_mappings( "MS_filename <-> spectrum_id". - `get_mappings_strain_id_spectrum_id`: Get mappings "strain_id <-> spectrum_id". """ - # Get mappings strain_id <-> original_geonme_id <-> resolved_genome_id <-> bgc_id + # Get mappings strain_id <-> original_genome_id <-> resolved_genome_id <-> bgc_id mappings_strain_id_bgc_id = get_mappings_strain_id_bgc_id( extract_mappings_strain_id_original_genome_id(podp_project_json_file), extract_mappings_original_genome_id_resolved_genome_id(genome_status_json_file), diff --git a/src/nplinker/utils.py b/src/nplinker/utils.py index aedccfa4..c28d14fd 100644 --- a/src/nplinker/utils.py +++ b/src/nplinker/utils.py @@ -27,7 +27,7 @@ from os import PathLike from pathlib import Path from typing import IO -from typing import Callable +from typing import Callable, Sequence import httpx from rich.progress import BarColumn from rich.progress import DownloadColumn @@ -109,6 +109,15 @@ def is_file_format(file: str | PathLike, format: str = "tsv") -> bool: def calculate_md5(fpath: str | PathLike, chunk_size: int = 1024 * 1024) -> str: + """Calculate the MD5 checksum of a file. + + Args: + fpath: Path to the file. + chunk_size: Chunk size for reading the file. Defaults to 1024*1024. + + Returns: + MD5 checksum of the file. + """ if sys.version_info >= (3, 9): md5 = hashlib.md5(usedforsecurity=False) else: @@ -120,6 +129,15 @@ def calculate_md5(fpath: str | PathLike, chunk_size: int = 1024 * 1024) -> str: def check_md5(fpath: str | PathLike, md5: str) -> bool: + """Verify the MD5 checksum of a file. + + Args: + fpath: Path to the file. + md5: MD5 checksum to verify. + + Returns: + True if the MD5 checksum matches, False otherwise. + """ return md5 == calculate_md5(fpath) @@ -238,7 +256,7 @@ def list_files( def _extract_tar( from_path: str | PathLike, to_path: str | PathLike, - members: list[tarfile.TarInfo] | None, + members: Sequence[tarfile.TarInfo] | None, compression: str | None, ) -> None: with tarfile.open(from_path, f"r:{compression[1:]}" if compression else "r") as tar: @@ -254,7 +272,7 @@ def _extract_tar( def _extract_zip( from_path: str | PathLike, to_path: str | PathLike, - members: list[str | zipfile.ZipInfo] | None, + members: Sequence[str | zipfile.ZipInfo] | None, compression: str | None, ) -> None: with zipfile.ZipFile( @@ -380,7 +398,7 @@ def extract_archive( If omitted, the directory of the archive file is used. members: Optional selection of members to extract. If not specified, all members are extracted. - Memers must be a subset of the list returned by + Members must be a subset of the list returned by - `zipfile.ZipFile.namelist()` or a list of strings for zip file - `tarfile.TarFile.getmembers()` for tar file remove_finished: If `True`, remove the file after the extraction. diff --git a/tests/unit/genomics/test_mibig_downloader.py b/tests/unit/genomics/test_mibig_downloader.py index 2e678ba2..2c00c2b4 100644 --- a/tests/unit/genomics/test_mibig_downloader.py +++ b/tests/unit/genomics/test_mibig_downloader.py @@ -2,7 +2,7 @@ from nplinker.genomics import mibig -class TestDownloadAndExtractMibigMetadatas: +class TestDownloadAndExtractMibigMetadata: def test_default(self, tmp_path): download_path = tmp_path / "download" extract_path = tmp_path / "metadata" diff --git a/tests/unit/genomics/test_mibig_loader.py b/tests/unit/genomics/test_mibig_loader.py index 1188971b..8722a6e0 100644 --- a/tests/unit/genomics/test_mibig_loader.py +++ b/tests/unit/genomics/test_mibig_loader.py @@ -48,13 +48,13 @@ def test_parse_data_dir(self, data_dir): assert isinstance(files["BGC0000001"], str) assert os.path.exists(files["BGC0000001"]) - def test_get_metadatas(self, loader): - metadatas = loader.get_metadatas() - assert isinstance(metadatas, dict) - assert len(metadatas) == 2502 # MIBiG v3.1 has 2502 BGCs - assert "BGC0000001" in metadatas - assert "BGC0000246" not in metadatas - assert isinstance(metadatas["BGC0000001"], MibigMetadata) + def test_get_metadata(self, loader): + metadata = loader.get_metadata() + assert isinstance(metadata, dict) + assert len(metadata) == 2502 # MIBiG v3.1 has 2502 BGCs + assert "BGC0000001" in metadata + assert "BGC0000246" not in metadata + assert isinstance(metadata["BGC0000001"], MibigMetadata) def test_get_bgcs(self, loader): bgcs = loader.get_bgcs() diff --git a/tests/unit/schemas/test_genome_bgc_mappings_schema.py b/tests/unit/schemas/test_genome_bgc_mappings_schema.py index 4617558c..ebc1e6d8 100644 --- a/tests/unit/schemas/test_genome_bgc_mappings_schema.py +++ b/tests/unit/schemas/test_genome_bgc_mappings_schema.py @@ -47,7 +47,7 @@ } -# Test schema aginast invalid data +# Test schema against invalid data @pytest.mark.parametrize( "data, expected", [ @@ -72,7 +72,7 @@ def test_invalid_data(data, expected): assert e.value.message == expected -# Test schema aginast valid data +# Test schema against valid data def test_valid_data(): data = { "mappings": [ diff --git a/tests/unit/schemas/test_genome_status_schema.py b/tests/unit/schemas/test_genome_status_schema.py index cdea276f..de0679a5 100644 --- a/tests/unit/schemas/test_genome_status_schema.py +++ b/tests/unit/schemas/test_genome_status_schema.py @@ -103,7 +103,7 @@ } -# Test schema aginast invalid data +# Test schema against invalid data @pytest.mark.parametrize( "data, expected", [ @@ -129,7 +129,7 @@ def test_invalid_data(data, expected): assert e.value.message == expected -# Test schema aginast valid data +# Test schema against valid data def test_valid_data(): data = { "genome_status": [ diff --git a/tests/unit/schemas/test_strain_mappings_schema.py b/tests/unit/schemas/test_strain_mappings_schema.py index 5a45f979..46d00d0c 100644 --- a/tests/unit/schemas/test_strain_mappings_schema.py +++ b/tests/unit/schemas/test_strain_mappings_schema.py @@ -58,7 +58,7 @@ } -# Test schema aginast invalid data +# Test schema against invalid data @pytest.mark.parametrize( "data, expected", [ @@ -83,7 +83,7 @@ def test_invalid_data(data, expected): assert e.value.message == expected -# Test schema aginast valid data +# Test schema against valid data def test_valid_data(): data = { "strain_mappings": [ diff --git a/tests/unit/schemas/test_user_strains_schema.py b/tests/unit/schemas/test_user_strains_schema.py index 423b84e0..5a4e37ac 100644 --- a/tests/unit/schemas/test_user_strains_schema.py +++ b/tests/unit/schemas/test_user_strains_schema.py @@ -4,10 +4,15 @@ from nplinker.schemas import USER_STRAINS_SCHEMA -# Test schema aginast invalid data +# Test schema against invalid data data_no_strain_ids = {"version": "1.0"} data_empty_strain_ids = {"strain_ids": [], "version": "1.0"} -data_invalid_strain_ids = {"strain_ids": [1, ], "version": "1.0"} +data_invalid_strain_ids = { + "strain_ids": [ + 1, + ], + "version": "1.0", +} data_empty_version = {"strain_ids": ["strain1", "strain2"], "version": ""} data_invalid_version = {"strain_ids": ["strain1", "strain2"], "version": "1.0.0"} @@ -29,7 +34,7 @@ def test_invalid_data(data, expected): assert e.value.message == expected -# Test schema aginast valid data +# Test schema against valid data data = {"strain_ids": ["strain1", "strain2"], "version": "1.0"} data_no_version = {"strain_ids": ["strain1", "strain2"]} diff --git a/tests/unit/scoring/conftest.py b/tests/unit/scoring/conftest.py index ec2f4ad6..009a9252 100644 --- a/tests/unit/scoring/conftest.py +++ b/tests/unit/scoring/conftest.py @@ -69,13 +69,10 @@ def npl(gcfs, spectra, mfs, strains, tmp_path) -> NPLinker: manually set its attributes to the values we want to test. The config file `nplinker_demo1.toml` does not affect the tests, just - making sure the NPLinker object can be created succesfully. + making sure the NPLinker object can be created successfully. """ - os.environ["NPLINKER_ROOT_DIR"] = str(tmp_path) # Create a tmporary root dir for NPLinker + os.environ["NPLINKER_ROOT_DIR"] = str(tmp_path) # Create a temporary root dir for NPLinker npl = NPLinker(CONFIG_FILE_LOCAL_MODE) - npl._gcfs = gcfs - npl._spectra = spectra - npl._mfs = mfs npl._strains = strains npl._gcf_dict = {gcf.id: gcf for gcf in gcfs} npl._mf_dict = {mf.id: mf for mf in mfs} diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index f681cc92..8ced67c3 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -5,7 +5,7 @@ def test_config(tmp_path): """Test loading the default config file.""" - os.environ["NPLINKER_ROOT_DIR"] = str(tmp_path) # Create a tmporary root dir for NPLinker + os.environ["NPLINKER_ROOT_DIR"] = str(tmp_path) # Create a temporary root dir for NPLinker config = load_config(CONFIG_FILE_LOCAL_MODE) assert config.mode == "local" diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 0242ebe8..29eaf056 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -17,7 +17,7 @@ def test_find_delimiter(filename, expected): BGC_GBK_URL = "https://mibig.secondarymetabolites.org/repository/BGC0000001/BGC0000001.gbk" -MIBIG_METADATAS_URL = "https://dl.secondarymetabolites.org/mibig/mibig_json_3.1.tar.gz" +MIBIG_METADATA_URL = "https://dl.secondarymetabolites.org/mibig/mibig_json_3.1.tar.gz" ROOT = Path(__file__).parent @@ -47,7 +47,7 @@ class TestExtractArchive: @pytest.fixture def archive(self): temppath = mkdtemp() - utils.download_url(MIBIG_METADATAS_URL, temppath) + utils.download_url(MIBIG_METADATA_URL, temppath) archive = Path(temppath) / "mibig_json_3.1.tar.gz" yield archive @@ -82,7 +82,7 @@ def temppath2(self): rmtree(temppath) def test_defaults(self, temppath1): - utils.download_and_extract_archive(url=MIBIG_METADATAS_URL, download_root=temppath1) + utils.download_and_extract_archive(url=MIBIG_METADATA_URL, download_root=temppath1) fdownload = Path(temppath1) / "mibig_json_3.1.tar.gz" fextract = Path(temppath1) / "mibig_json_3.1" @@ -92,7 +92,7 @@ def test_defaults(self, temppath1): def test_optional_args(self, temppath1, temppath2): utils.download_and_extract_archive( - url=MIBIG_METADATAS_URL, + url=MIBIG_METADATA_URL, download_root=temppath1, extract_root=temppath2, filename="example.tar.gz", @@ -108,7 +108,7 @@ def test_optional_args(self, temppath1, temppath2): def test_arg_remove_finished(self, temppath1): utils.download_and_extract_archive( - url=MIBIG_METADATAS_URL, download_root=temppath1, remove_finished=True + url=MIBIG_METADATA_URL, download_root=temppath1, remove_finished=True ) fdownload = Path(temppath1) / "mibig_json_3.1.tar.gz"