Skip to content

Commit

Permalink
taxonomy: handling shortened latin names and any kind of lowercase …
Browse files Browse the repository at this point in the history
…name
  • Loading branch information
deeenes committed Nov 28, 2022
1 parent 51b2ac0 commit 32ddda3
Showing 1 changed file with 74 additions and 36 deletions.
110 changes: 74 additions & 36 deletions pypath/utils/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,9 +188,30 @@
}


# Lookup tables for resolving database specific taxon names. Each one is
# derived from a source table by `common.swap_dict_simple` (presumably a
# plain key/value swap, giving name -> NCBI Taxonomy ID -- TODO confirm).
dbptm_to_ncbi_tax_id = common.swap_dict_simple(dbptm_taxids)
latin_name_to_ncbi_tax_id = common.swap_dict_simple(phosphoelm_taxids)
ensembl_name_to_ncbi_tax_id = common.swap_dict_simple(ensembl_taxids)
def shorten_latin_name(name: str, dot: bool = True) -> 'str | None':
    """
    For a complete latin name, returns its shortened version.

    In short latin names the genus name is marked only by its initial,
    e.g. "Homo sapiens" becomes "H. sapiens". All words after the genus
    (species, subspecies, etc) are kept and separated by single spaces.

    Args:
        name: The full latin name, its words separated by whitespace.
        dot: Put a dot after the initial of the genus.

    Returns:
        The shortened name; None if `name` is empty or consists only of
        whitespace.
    """

    words = name.split() if name else []

    if not words:

        # nothing to shorten: empty, None or whitespace-only input
        return None

    initial = f'{words[0][0].upper()}{"." if dot else ""}'

    # The epithets must remain separate words: concatenating them without
    # a separator would turn "Canis lupus familiaris" into
    # "C. lupusfamiliaris". Joining the initial in the same call also
    # avoids a trailing space when only the genus is given.
    return ' '.join([initial, *words[1:]])


def short_latin_names(long_names: dict[str, int]) -> dict[str, int]:
    """
    Creates a dict of shortened latin names from a dict of full names.

    Each key of `long_names` is passed to `shorten_latin_name` twice: once
    with and once without the dot after the genus initial. Both resulting
    keys point to the value of the original key.
    """

    shortened = {}

    for full_name, value in long_names.items():

        for with_dot in (True, False):

            short_name = shorten_latin_name(full_name, dot = with_dot)
            shortened[short_name] = value

    return shortened


def ensure_common_name(taxon_id):
Expand Down Expand Up @@ -238,31 +259,31 @@ def taxid_from_common_name(taxon_name):
taxon_name = taxon_name.strip()
taxon_name_l = taxon_name.lower()
taxon_name_c = taxon_name.capitalize()

if (
taxon_name is None or
not taxon_name_l or
taxon_name in {'none', 'unknown'}
):

return None

if taxon_name_l in taxa_synonyms:

return taxid_from_common_name(taxa_synonyms[taxon_name_l])

if taxon_name_l in taxa:

return taxa[taxon_name_l]

if taxon_name_l in taxa2:

return taxa2[taxon_name_l]

common_to_ncbi = get_db('common')

if taxon_name in common_to_ncbi:

return common_to_ncbi[taxon_name]

if taxon_name_c in common_to_ncbi:
Expand All @@ -271,29 +292,33 @@ def taxid_from_common_name(taxon_name):


def taxid_from_latin_name(taxon_name):
    """
    Looks up a taxon by its latin name.

    The module level tables `latin_name_to_ncbi_tax_id` and
    `short_latin_name_to_ncbi_tax_id` are checked first, in this order.
    The 'latin' database is requested from `get_db` only if neither of
    them contains the name. Returns the value stored under `taxon_name`
    in the first table that has it, or None if none of them does.
    """

    builtin_tables = (
        latin_name_to_ncbi_tax_id,
        short_latin_name_to_ncbi_tax_id,
    )

    for table in builtin_tables:

        if taxon_name in table:

            return table[taxon_name]

    # fetched only after the built-in tables have missed
    latin_db = get_db('latin')

    if taxon_name in latin_db:

        return latin_db[taxon_name]


def taxid_from_dbptm_taxon_name(taxon_name):
    """
    Looks up a taxon name in the `dbptm_to_ncbi_tax_id` table.

    Returns the value stored under `taxon_name`, or None if the table
    has no such key.
    """

    if taxon_name not in dbptm_to_ncbi_tax_id:

        return None

    return dbptm_to_ncbi_tax_id[taxon_name]


def taxid_from_nonstandard(taxon_name):
    """
    Looks up a taxon name in the `nonstandard_taxids` table.

    Returns the value stored under `taxon_name`, or None if the table
    has no such key.
    """

    return (
        nonstandard_taxids[taxon_name]
        if taxon_name in nonstandard_taxids else
        None
    )


Expand All @@ -310,44 +335,44 @@ def ensure_ncbi_tax_id(taxon_id):
Handles English names, scientific names and other common language
synonyms and database specific codenames.
"""

if isinstance(taxon_id, int):

return taxon_id

else:

if hasattr(taxon_id, 'strip'):

taxon_id = taxon_id.strip()

if common.is_str(taxon_id) and '(' in taxon_id:

part0, part1 = taxon_id.split('(', maxsplit = 1)

ncbi_tax_id = (
ensure_ncbi_tax_id(part0) or
ensure_ncbi_tax_id(part1.split(')', maxsplit = 1)[0])
)

elif hasattr(taxon_id, 'isdigit') and taxon_id.isdigit():

ncbi_tax_id = int(taxon_id)

else:

ncbi_tax_id = (
taxid_from_dbptm_taxon_name(taxon_id) or
taxid_from_nonstandard(taxon_id) or
taxid_from_common_name(taxon_id) or
taxid_from_latin_name(taxon_id) or
taxid_from_ensembl_name(taxon_id)
)

if not ncbi_tax_id:

_log('Could not map to NCBI Taxonomy ID: `%s`.' % str(taxon_id))

return ncbi_tax_id


Expand All @@ -363,6 +388,11 @@ def uniprot_taxid(uniprot):
return uniprot_to_taxid[uniprot]


# Lookup tables used by the name resolving functions of this module, such
# as `taxid_from_dbptm_taxon_name` and `taxid_from_latin_name`. Most are
# derived from a source table by `common.swap_dict_simple` (presumably a
# plain key/value swap, giving name -> NCBI Taxonomy ID -- TODO confirm).
dbptm_to_ncbi_tax_id = common.swap_dict_simple(dbptm_taxids)
latin_name_to_ncbi_tax_id = common.swap_dict_simple(phosphoelm_taxids)
# Shortened variants of the latin names above; this line must stay after
# the `latin_name_to_ncbi_tax_id` assignment as it is computed from it.
short_latin_name_to_ncbi_tax_id = short_latin_names(latin_name_to_ncbi_tax_id)
ensembl_name_to_ncbi_tax_id = common.swap_dict_simple(ensembl_taxids)

# Timeloop instance for the cleanup job; its logger level is set to 9999
# (presumably to silence the messages of timeloop -- TODO confirm).
_cleanup_timeloop = timeloop.Timeloop()
_cleanup_timeloop.logger.setLevel(9999)

Expand Down Expand Up @@ -437,6 +467,10 @@ def init_db(key):
for taxon in ncbi_data.values()
)

if not swap:

this_db.update(short_latin_names(this_db))

elif _key == 'common':

this_db = (
Expand Down Expand Up @@ -485,6 +519,10 @@ def init_db(key):

this_db = common.swap_dict_simple(this_db)

else:

this_db.update({k.lower(): v for k, v in this_db.items()})

if this_db:

globals()['db'][key] = this_db
Expand Down

0 comments on commit 32ddda3

Please sign in to comment.