From 06fe1f91952ba2d845287d354fc5f11a65227b25 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 14 Jun 2024 13:31:54 +0200 Subject: [PATCH] rename `molfam` to `mf` to uniform the abbreviation for MolecularFamily (#255) --- src/nplinker/class_info/chem_classes.py | 106 ++++++++++----------- src/nplinker/loader.py | 20 ++-- src/nplinker/nplinker.py | 10 +- src/nplinker/pickler.py | 2 +- src/nplinker/scoring/metcalf_scoring.py | 4 +- src/nplinker/scoring/np_class_scoring.py | 32 +++---- tests/integration/test_nplinker_local.py | 2 +- tests/unit/class_info/test_chem_classes.py | 10 +- tests/unit/scoring/conftest.py | 2 +- 9 files changed, 92 insertions(+), 96 deletions(-) diff --git a/src/nplinker/class_info/chem_classes.py b/src/nplinker/class_info/chem_classes.py index 5bf1327d5..c7bbb576f 100644 --- a/src/nplinker/class_info/chem_classes.py +++ b/src/nplinker/class_info/chem_classes.py @@ -45,7 +45,7 @@ def __init__(self, canopus_dir, mne_dir, gnps_dir): class_predict_options = [] if self._canopus.spectra_classes: class_predict_options.append("canopus") - if self._molnetenhancer.spectra2molfam: + if self._molnetenhancer.spectra2mf: class_predict_options.append("molnetenhancer") if class_predict_options: class_predict_options = ["mix", "main"] + class_predict_options @@ -70,7 +70,7 @@ class CanopusResults: The results from the canopus dir are read and combined with the MN from GNPS using canopus_treemap: github.com/louwenjjr/canopus_treemap/tree/master/canopus - This creates the two files that are read for the spectra and molfams: + This creates the two files that are read for the spectra and mfs: -cluster_index_classifications.txt -component_index_classifications.txt @@ -91,7 +91,7 @@ def __init__(self, canopus_dir, gnps_dir): """ self._canopus_dir = canopus_dir self._gnps_dir = gnps_dir - self._molfam_classes, self._molfam_classes_names, self._molfam_classes_names_inds = ( + self._mf_classes, self._mf_classes_names, self._mf_classes_names_inds = ( None, None, None, @@ -163,12 +163,10 @@ def _read_all_classes(self): spectra_classes_names, spectra_classes = self._read_spectra_classes(ci_file) if os.path.isfile(compi_file): - molfam_classes_names, molfam_classes = self._read_molfam_classes(compi_file) - self._molfam_classes = molfam_classes - self._molfam_classes_names = molfam_classes_names - self._molfam_classes_names_inds = { - elem: i for i, elem in enumerate(molfam_classes_names) - } + mf_classes_names, mf_classes = self._read_mf_classes(compi_file) + self._mf_classes = mf_classes + self._mf_classes_names = mf_classes_names + self._mf_classes_names_inds = {elem: i for i, elem in enumerate(mf_classes_names)} else: # use canopus output correctly (only for spectra) logger.info( @@ -176,8 +174,8 @@ def _read_all_classes(self): "canopus_dir (canopus_summary.tsv)" ) spectra_classes_names, spectra_classes = self._read_spectra_classes_directly() - # molfams have to be added later with info about molfam <- spectra - # this happens with transfer_spec_classes_to_molfams() in loader.py + # mfs have to be added later with info about mf <- spectra + # this happens with transfer_spec_classes_to_mfs() in loader.py self._spectra_classes = spectra_classes self._spectra_classes_names = spectra_classes_names @@ -331,15 +329,15 @@ class prediction for a level. When no class is present, instead of Tuple it will outf.write("\t".join(output_l) + "\n") return can_classes_names, can_classes - def _read_molfam_classes(self, input_file): - """Read canopus classes for molfams, return classes_names, classes. + def _read_mf_classes(self, input_file): + """Read canopus classes for mfs, return classes_names, classes. Args: input_file: str, component_index_classifications.txt Returns: Tuple of: - compi_classes_names: list of str - the names of each different level - - compi_classes: dict of {str: lists of tuple(str, float)} - per molfam index (key) the classes for each level + - compi_classes: dict of {str: lists of tuple(str, float)} - per mf index (key) the classes for each level where each level is a list of (class_name, fraction) sorted on best choice so index 0 is the best class prediction for a level. When no class is present, instead of Tuple it will be None for that level. """ @@ -376,33 +374,33 @@ class prediction for a level. When no class is present, instead of Tuple it will ] return compi_classes_names, compi_classes - def transfer_spec_classes_to_molfams(self, molfams, fraction_cutoff=0.0): - """Set _molfam_classes(_names) from spectra_classes and return classes. + def transfer_spec_classes_to_mfs(self, mfs, fraction_cutoff=0.0): + """Set _mf_classes(_names) from spectra_classes and return classes. - This can be used in the _loader to get molfam classes when the GNPS MN + This can be used in the _loader to get mf classes when the GNPS MN version is too old and canopus_treemap fails to work directly. Args: - molfams: list of MolecularFamily from the NPLinker space + mfs: list of MolecularFamily from the NPLinker space fraction_cutoff: float, cut-off for the fraction of class terms - needed to be included in the molfam + needed to be included in the mf Returns: - dict of {str: lists of tuple(str, float)} - per molfam (key) the classes for each level + dict of {str: lists of tuple(str, float)} - per mf (key) the classes for each level where each level is a list of (class_name, fraction) sorted on best choice so index 0 is the best class prediction for a level. When no class is present, instead of Tuple it will be None for that level. """ - self._molfam_classes_names = self._spectra_classes_names - self._molfam_classes_names_inds = self._spectra_classes_names_inds - molfam_classes = {} + self._mf_classes_names = self._spectra_classes_names + self._mf_classes_names_inds = self._spectra_classes_names_inds + mf_classes = {} - for molfam in molfams: - fid = molfam.id # the key - spectra = molfam.spectra + for mf in mfs: + fid = mf.id # the key + spectra = mf.spectra # if singleton family, format like 'fid_spectrum-id' if fid.startswith("singleton-"): spec_id = spectra[0].id fid += f"_{spec_id}" - len_molfam = len(spectra) + len_mf = len(spectra) classes_per_spectra = [] for spec in spectra: @@ -411,10 +409,10 @@ class prediction for a level. When no class is present, instead of Tuple it will classes_per_spectra.append(spec_classes) if not classes_per_spectra: - continue # no spectra with classes for this molfam + continue # no spectra with classes for this mf sorted_classes = [] - for i, class_level in enumerate(self._molfam_classes_names): + for i, class_level in enumerate(self._mf_classes_names): # 1. aggregate classes from all spectra for this class level classes_cur_level = [] for spec_classes in classes_per_spectra: @@ -423,7 +421,7 @@ class prediction for a level. When no class is present, instead of Tuple it will if class_tup: classes_cur_level.append(class_tup[0]) except IndexError: - print(self._molfam_classes_names) + print(self._mf_classes_names) print(i, class_level) print(classes_per_spectra) print(spec_classes) @@ -433,9 +431,9 @@ class prediction for a level. When no class is present, instead of Tuple it will # 3. calculate fraction and sort high to low, filter out Nones fraction_tups = sorted( ( - (cls, count / len_molfam) + (cls, count / len_mf) for cls, count in counts_cur_level.most_common() - if count / len_molfam >= fraction_cutoff + if count / len_mf >= fraction_cutoff ), key=lambda x: x[1], reverse=True, @@ -443,10 +441,10 @@ class prediction for a level. When no class is present, instead of Tuple it will if not fraction_tups: fraction_tups = [None] sorted_classes.append(fraction_tups) - molfam_classes[fid] = sorted_classes + mf_classes[fid] = sorted_classes - self._molfam_classes = molfam_classes - return molfam_classes + self._mf_classes = mf_classes + return mf_classes def show(self, objects): """Show a table of predicted chemical compound classes for spectrum/MF. @@ -471,16 +469,16 @@ def spectra_classes_names_inds(self): return self._spectra_classes_names_inds @property - def molfam_classes(self): - return self._molfam_classes + def mf_classes(self): + return self._mf_classes @property - def molfam_classes_names(self): - return self._molfam_classes_names + def mf_classes_names(self): + return self._mf_classes_names @property - def molfam_classes_names_inds(self): - return self._molfam_classes_names_inds + def mf_classes_names_inds(self): + return self._mf_classes_names_inds class MolNetEnhancerResults: @@ -496,9 +494,9 @@ def __init__(self, mne_dir): Args: mne_dir: str, mne_dir found in root_dir of nplinker project """ - cf_classes_names, molfam_classes, spectra2molfam = self._read_cf_classes(mne_dir) - self._spectra2molfam = spectra2molfam - self._molfam_classes = molfam_classes + cf_classes_names, mf_classes, spectra2mf = self._read_cf_classes(mne_dir) + self._spectra2mf = spectra2mf + self._mf_classes = mf_classes self._spectra_classes_names = cf_classes_names # if NPC gets implemented, add here self._spectra_classes_names_inds = {elem: i for i, elem in enumerate(cf_classes_names)} @@ -510,9 +508,9 @@ def _read_cf_classes(self, mne_dir): Returns: tuple of: -list of str - names of the classes in order - -dict of {str: [(str, float)]} - linking molfams to (classes, scores) in order of names, + -dict of {str: [(str, float)]} - linking mfs to (classes, scores) in order of names, singleton families are denoted with S[\d]+ - -dict of {str:str} - linking spectra to molfams + -dict of {str:str} - linking spectra to mfs """ columns = [] mne_component_dict = {} @@ -579,7 +577,7 @@ def _read_cf_classes(self, mne_dir): return columns, mne_component_dict, mne_cluster2component def spectra_classes(self, spectrum_id): - """Return classes by relating spectrum_id in the molfam_classes. + """Return classes by relating spectrum_id in the mf_classes. Args: spectrum_id: int/str, spectrum_id - ints will be converted to str @@ -587,14 +585,14 @@ def spectra_classes(self, spectrum_id): classes = [] if isinstance(spectrum_id, int): spectrum_id = str(spectrum_id) - molfam_id = self.spectra2molfam.get(spectrum_id) - if molfam_id: - classes = self.molfam_classes.get(molfam_id) + mf_id = self.spectra2mf.get(spectrum_id) + if mf_id: + classes = self.mf_classes.get(mf_id) return classes @property - def spectra2molfam(self): - return self._spectra2molfam + def spectra2mf(self): + return self._spectra2mf @property def spectra_classes_names(self): @@ -605,5 +603,5 @@ def spectra_classes_names_inds(self): return self._spectra_classes_names_inds @property - def molfam_classes(self): - return self._molfam_classes + def mf_classes(self): + return self._mf_classes diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 1b332985f..cfa7fe363 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -35,7 +35,7 @@ class DatasetLoader: bgcs: A list of BGC objects. gcfs: A list of GCF objects. spectra: A list of Spectrum objects. - molfams: A list of MolecularFamily objects. + mfs: A list of MolecularFamily objects. mibig_bgcs: A list of MIBiG BGC objects. mibig_strains_in_use: A StrainCollection object that contains the strains in use from MIBiG. product_types: A list of product types. @@ -60,7 +60,7 @@ def __init__(self, config: Dynaconf): """ self.config = config - self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], [] + self.bgcs, self.gcfs, self.spectra, self.mfs = [], [], [], [] self.mibig_bgcs = [] self.mibig_strains_in_use = StrainCollection() self.product_types = [] @@ -114,7 +114,7 @@ def _load_metabolomics(self): objects added (i.e. `Spectrum.strains` updated). If a Spectrum object does not have Strain objects, it is not added to `self.spectra`. - The attribute of `self.molfams` is set to the loaded MolecularFamily objects that have + The attribute of `self.mfs` is set to the loaded MolecularFamily objects that have Strain objects added (i.e. `MolecularFamily._strains` updated). This means only Spectra objects with updated strains (i.e. `self.spectra`) can be added to MolecularFamily objects. """ @@ -129,7 +129,7 @@ def _load_metabolomics(self): gnps_dir / defaults.GNPS_ANNOTATIONS_FILENAME ).annotations # Step 3: load all MolecularFamily objects - raw_molfams = GNPSMolecularFamilyLoader( + raw_mfs = GNPSMolecularFamilyLoader( gnps_dir / defaults.GNPS_MOLECULAR_FAMILY_FILENAME ).get_mfs(keep_singleton=False) @@ -139,11 +139,11 @@ def _load_metabolomics(self): spectra_with_strains, _ = add_strains_to_spectrum(self.strains, raw_spectra) # Step 6: add Spectrum objects to MolecularFamily - mf_with_spec, _, _ = add_spectrum_to_mf(spectra_with_strains, raw_molfams) + mf_with_spec, _, _ = add_spectrum_to_mf(spectra_with_strains, raw_mfs) - # Step 7: set attributes of self.spectra and self.molfams with valid objects + # Step 7: set attributes of self.spectra and self.mfs with valid objects self.spectra = spectra_with_strains - self.molfams = mf_with_spec + self.mfs = mf_with_spec logger.info("Loading metabolomics data completed\n") return True @@ -266,10 +266,10 @@ def _load_class_info(self): # load Chem_class_predictions (canopus, molnetenhancer are loaded) chem_classes = ChemClassPredictions(self.canopus_dir, self.molnetenhancer_dir, self._root) # noqa - # if no molfam classes transfer them from spectra (due to old style MN) - if not chem_classes.canopus.molfam_classes and chem_classes.canopus.spectra_classes: + # if no mf classes transfer them from spectra (due to old style MN) + if not chem_classes.canopus.mf_classes and chem_classes.canopus.spectra_classes: logger.info("Added chemical compound classes for MFs") - chem_classes.canopus.transfer_spec_classes_to_molfams(self.molfams) + chem_classes.canopus.transfer_spec_classes_to_mfs(self.mfs) # include them in loader self.chem_classes = chem_classes return True diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index 0dff285e0..2a1059a8e 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -61,7 +61,7 @@ def __init__(self, config_file: str | PathLike): self._gcfs = [] self._strains = None self._metadata = {} - self._molfams = [] + self._mfs = [] self._mibig_bgcs = [] self._chem_classes = None self._class_matches = None @@ -147,7 +147,7 @@ def load_data(self): loader.load() self._spectra = loader.spectra - self._molfams = loader.molfams + self._mfs = loader.mfs self._bgcs = loader.bgcs self._gcfs = loader.gcfs self._mibig_bgcs = loader.mibig_bgcs @@ -160,7 +160,7 @@ def load_data(self): def get_links( self, input_objects: list, scoring_methods: list, and_mode: bool = True ) -> LinkCollection: - """Find links for a set of input objects (BGCs/GCFs/Spectra/MolFams). + """Find links for a set of input objects (BGCs/GCFs/Spectra/mfs). The input objects can be any mix of the following NPLinker types: @@ -303,9 +303,9 @@ def spectra(self): return self._spectra @property - def molfams(self): + def mfs(self): """Returns a list of all the MolecularFamilies in the dataset.""" - return self._molfams + return self._mfs @property def metadata(self): diff --git a/src/nplinker/pickler.py b/src/nplinker/pickler.py index c80866a50..17eac8890 100644 --- a/src/nplinker/pickler.py +++ b/src/nplinker/pickler.py @@ -75,7 +75,7 @@ def persistent_load(self, pid): elif obj_type == "Spectrum": return self.nplinker.spectra[obj_id] elif obj_type == "MolecularFamily": - return self.nplinker.molfams[obj_id] + return self.nplinker.mfs[obj_id] elif obj_type == "ScoringMethod": return self.nplinker.scoring_method(obj_id) diff --git a/src/nplinker/scoring/metcalf_scoring.py b/src/nplinker/scoring/metcalf_scoring.py index e1a090168..993362710 100644 --- a/src/nplinker/scoring/metcalf_scoring.py +++ b/src/nplinker/scoring/metcalf_scoring.py @@ -102,14 +102,14 @@ def setup(cls, npl: NPLinker): logger.info( f"MetcalfScoring.setup starts: #bgcs={len(npl.bgcs)}, #gcfs={len(npl.gcfs)}, " - f"#spectra={len(npl.spectra)}, #molfams={len(npl.molfams)}, #strains={npl.strains}" + f"#spectra={len(npl.spectra)}, #mfs={len(npl.mfs)}, #strains={npl.strains}" ) cls.npl = npl # calculate presence of gcfs/spectra/mfs with respect to strains cls.presence_gcf_strain = get_presence_gcf_strain(npl.gcfs, npl.strains) cls.presence_spec_strain = get_presence_spec_strain(npl.spectra, npl.strains) - cls.presence_mf_strain = get_presence_mf_strain(npl.molfams, npl.strains) + cls.presence_mf_strain = get_presence_mf_strain(npl.mfs, npl.strains) # calculate raw Metcalf scores for spec-gcf links raw_score_spec_gcf = cls._calc_raw_score( diff --git a/src/nplinker/scoring/np_class_scoring.py b/src/nplinker/scoring/np_class_scoring.py index 25479512e..5e2704880 100644 --- a/src/nplinker/scoring/np_class_scoring.py +++ b/src/nplinker/scoring/np_class_scoring.py @@ -44,7 +44,7 @@ def _npclass_score(self, obj, target, method="mix", obj_classes=None, target_cla target (if obj is BGC then obj_classes should be classes for the BGC). Can be: - BGC/GCF classes: {'as_classes': list(str), 'bigscape_class': str} - - Spectrum/Molfam classes: tuple of (classes, class_names_indices), + - Spectrum/mf classes: tuple of (classes, class_names_indices), where classes is a list of list of tuples/None, where each tuple is a class and a score (str, float), and class_names_indices is a list of ints that relate to the name of a class ontology lvl @@ -157,7 +157,7 @@ def _get_targets(self, test_id): """Get the targets based upon instance of test_id, returns list of targets. Args: - test_id: one of the NPLinker objects: BGC, GCF, Spectrum, Molfam + test_id: one of the NPLinker objects: BGC, GCF, Spectrum, mf Returns: List of one or more of one of the NPLinker objects """ @@ -174,7 +174,7 @@ def _get_targets_metabolomics(self, test_id): targets = self.npl.bgcs else: targets = self.npl.gcfs - else: # obj are molfam + else: # obj are mf if self.equal_targets: targets = self.npl.gcfs else: @@ -183,15 +183,15 @@ def _get_targets_metabolomics(self, test_id): def _get_targets_genomics(self, test_id): if self.both_targets: # no matter BGC or GCF take both spec and MF - targets = self.npl.spectra + self.npl.molfams + targets = self.npl.spectra + self.npl.mfs elif isinstance(test_id, BGC): # obj are BGC if self.equal_targets: # take targets = self.npl.spectra else: - targets = self.npl.molfams + targets = self.npl.mfs else: # obj are GCF if self.equal_targets: - targets = self.npl.molfams + targets = self.npl.mfs else: targets = self.npl.spectra return targets @@ -236,10 +236,10 @@ def _get_gen_classes(self, bgc_like, gcf_as_cutoff=0.5): return bgc_like_classes_dict def _get_met_classes(self, spec_like, method="mix"): - """Get chemical classes for a Spectrum or MolFam based on method. + """Get chemical classes for a Spectrum or mf based on method. Args: - spec_like: Spectrum or MolFam, one of the NPLinker input types + spec_like: Spectrum or mf, one of the NPLinker input types method: str, one of the appropriate methods for chemical class predictions (mix, canopus...), default='mix' Returns: @@ -248,7 +248,7 @@ def _get_met_classes(self, spec_like, method="mix"): tuple is a class and a score (str, float), and class_names_indices is a list of ints that relate to the name of a class ontology lvl """ - # assess if spectrum or molfam + # assess if spectrum or mf is_spectrum = isinstance(spec_like, Spectrum) # gather classes for spectra, using right method @@ -280,11 +280,11 @@ def _get_met_classes(self, spec_like, method="mix"): spec_like_classes_names_inds = ( self.npl.chem_classes.canopus.spectra_classes_names_inds ) - else: # molfam + else: # mf fam_id = spec_like.family.id if fam_id.startswith("singleton-"): # account for singleton families fam_id += f"_{spec_like.spectra[0].id}" - all_classes = self.npl.chem_classes.canopus.molfam_classes.get(fam_id) + all_classes = self.npl.chem_classes.canopus.mf_classes.get(fam_id) if all_classes: spec_like_classes = [ cls_per_lvl @@ -292,21 +292,19 @@ def _get_met_classes(self, spec_like, method="mix"): for i, cls_per_lvl in enumerate(lvl) if i == 0 ] - spec_like_classes_names_inds = ( - self.npl.chem_classes.canopus.molfam_classes_names_inds - ) + spec_like_classes_names_inds = self.npl.chem_classes.canopus.mf_classes_names_inds if use_mne and not spec_like_classes: # if mne or when main/canopus does not get classes if is_spectrum: spec_like_classes = self.npl.chem_classes.molnetenhancer.spectra_classes( spec_like.id ) - else: # molfam + else: # mf fam_id = spec_like.family.id if fam_id.startswith("singleton"): # account for singleton families fam_id += f"_{spec_like.spectra[0].id}" - spec_like_classes = self.npl.chem_classes.molnetenhancer.molfam_classes.get(fam_id) - # classes are same for molfam and spectrum so names are irrespective of is_spectrum + spec_like_classes = self.npl.chem_classes.molnetenhancer.mf_classes.get(fam_id) + # classes are same for mf and spectrum so names are irrespective of is_spectrum spec_like_classes_names_inds = ( self.npl.chem_classes.molnetenhancer.spectra_classes_names_inds ) diff --git a/tests/integration/test_nplinker_local.py b/tests/integration/test_nplinker_local.py index 29bf48f2e..ea19b43ae 100644 --- a/tests/integration/test_nplinker_local.py +++ b/tests/integration/test_nplinker_local.py @@ -61,5 +61,5 @@ def test_load_data(npl: NPLinker): assert len(npl.bgcs) == 390 assert len(npl.gcfs) == 64 assert len(npl.spectra) == 24652 - assert len(npl.molfams) == 29 + assert len(npl.mfs) == 29 assert len(npl.strains) == 46 diff --git a/tests/unit/class_info/test_chem_classes.py b/tests/unit/class_info/test_chem_classes.py index b817ea045..560639611 100644 --- a/tests/unit/class_info/test_chem_classes.py +++ b/tests/unit/class_info/test_chem_classes.py @@ -13,9 +13,9 @@ # def test_running(self): # for i, elem in enumerate([ # self._cr.spectra_classes, self._cr.spectra_classes_names, -# self._cr.spectra_classes_names_inds, self._cr.molfam_classes, -# self._cr.molfam_classes_names, -# self._cr.molfam_classes_names_inds]): +# self._cr.spectra_classes_names_inds, self._cr.mf_classes, +# self._cr.mf_classes_names, +# self._cr.mf_classes_names_inds]): # self.assertTrue(len(elem) != 0, f"Element {i} failed to load") # def test_can_treemap(self): @@ -32,10 +32,10 @@ def setUp(self): def test_running(self): for i, elem in enumerate( [ - self._mr.spectra2molfam, + self._mr.spectra2mf, self._mr.spectra_classes_names, self._mr.spectra_classes_names_inds, - self._mr.molfam_classes, + self._mr.mf_classes, ] ): self.assertTrue(len(elem) != 0, f"Element {i} failed to load") diff --git a/tests/unit/scoring/conftest.py b/tests/unit/scoring/conftest.py index a39381ee0..8c7ac1f5e 100644 --- a/tests/unit/scoring/conftest.py +++ b/tests/unit/scoring/conftest.py @@ -75,7 +75,7 @@ def npl(gcfs, spectra, mfs, strains, tmp_path) -> NPLinker: npl = NPLinker(CONFIG_FILE_LOCAL_MODE) npl._gcfs = gcfs npl._spectra = spectra - npl._molfams = mfs + npl._mfs = mfs npl._strains = strains npl._gcf_lookup = {gcf.id: gcf for gcf in gcfs} npl._mf_lookup = {mf.id: mf for mf in mfs}