Skip to content

Commit

Permalink
small reformat
Browse files Browse the repository at this point in the history
Signed-off-by: Radu Muntean <[email protected]>
  • Loading branch information
heracle committed Aug 5, 2021
1 parent 76dcb32 commit db97283
Show file tree
Hide file tree
Showing 4 changed files with 138 additions and 147 deletions.
95 changes: 0 additions & 95 deletions metagraph/src/annotation/taxonomy/label_to_taxid.cpp

This file was deleted.

145 changes: 117 additions & 28 deletions metagraph/src/annotation/taxonomy/tax_classifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "annotation/representation/annotation_matrix/annotation_matrix.hpp"
#include "common/unix_tools.hpp"
#include "common/utils/string_utils.hpp"

#include "common/logger.hpp"

Expand All @@ -14,68 +15,156 @@ namespace annot {

using mtg::common::logger;

bool TaxonomyBase::assign_label_type(const std::string &sample_label) {
if (utils::starts_with(sample_label, "gi|")) {
// e.g. >gi|1070643132|ref|NC_031224.1| Arthrobacter phage Mudcat, complete genome
label_type = GEN_BANK;
return true;
} else if (utils::starts_with(utils::split_string(sample_label, ":")[1], "taxid|")) {
// e.g. >kraken:taxid|2016032|NC_047834.1 Alteromonas virus vB_AspP-H4/4, complete genome
label_type = TAXID;
return false;
}

logger->error("Can't determine the type of the given label {}. "
"Make sure the labels are in a recognized format.", sample_label);
exit(1);
}

bool TaxonomyBase::get_taxid_from_label(const std::string &label, TaxId *taxid) const {
if (label_type == TAXID) {
*taxid = std::stoul(utils::split_string(label, "|")[1]);
return true;
} else if (TaxonomyBase::label_type == GEN_BANK) {
std::string acc_version = get_accession_version_from_label(label);
if (not accversion_to_taxid_map.count(acc_version)) {
return false;
}
*taxid = accversion_to_taxid_map.at(acc_version);
return true;
}

logger->error("Error: Could not get the taxid for label {}", label);
exit(1);
}

std::string TaxonomyBase::get_accession_version_from_label(const std::string &label) const {
if (label_type == TAXID) {
return utils::split_string(utils::split_string(label, "|")[2], " ")[0];
} else if (label_type == GEN_BANK) {
return utils::split_string(label, "|")[3];;
}

logger->error("Error: Could not get the accession version for label {}", label);
exit(1);
}

// TODO improve this by parsing the compressed ".gz" version (or use https://github.com/pmenzel/taxonomy-tools)
void TaxonomyBase::read_accversion_to_taxid_map(const std::string &filepath,
const graph::AnnotatedDBG *anno_matrix = NULL) {
std::ifstream f(filepath);
if (!f.good()) {
logger->error("Failed to open accession to taxid map table {}", filepath);
exit(1);
}

std::string line;
getline(f, line);
if (!utils::starts_with(line, "accession\taccession.version\ttaxid\t")) {
logger->error("The accession to taxid map table is not in the standard (*.accession2taxid) format {}",
filepath);
exit(1);
}

tsl::hopscotch_set<std::string> input_accessions;
if (anno_matrix != NULL) {
for (const std::string &label : anno_matrix->get_annotation().get_all_labels()) {
input_accessions.insert(get_accession_version_from_label(label));
}
}

while (getline(f, line)) {
if (line == "") {
logger->error("The accession to taxid map table contains empty lines. "
"Please make sure that this file was not manually modified {}", filepath);
exit(1);
}
std::vector<std::string> parts = utils::split_string(line, "\t");
if (parts.size() <= 2) {
logger->error("The accession to taxid map table contains incomplete lines. "
"Please make sure that this file was not manually modified {}", filepath);
exit(1);
}
if (input_accessions.size() == 0 || input_accessions.count(parts[1])) {
accversion_to_taxid_map[parts[1]] = std::stoul(parts[2]);
}
}
}

TaxonomyClsAnno::TaxonomyClsAnno(const graph::AnnotatedDBG &anno,
const double lca_coverage_rate,
const double kmers_discovery_rate,
const std::string &tax_tree_filepath,
const std::string &label_taxid_map_filepath) :
TaxonomyBase(lca_coverage_rate, kmers_discovery_rate), _anno_matrix(&anno) {
const std::string &label_taxid_map_filepath)
: TaxonomyBase(lca_coverage_rate, kmers_discovery_rate),
_anno_matrix(&anno) {
if (!std::filesystem::exists(tax_tree_filepath)) {
logger->error("Can't open taxonomic tree file {}.", tax_tree_filepath);
std::exit(1);
logger->error("Can't open taxonomic tree file {}", tax_tree_filepath);
exit(1);
}

bool require_accversion_to_taxid_map = false;
assign_label_type(_anno_matrix->get_annotation().get_all_labels()[0], &require_accversion_to_taxid_map);
bool require_accversion_to_taxid_map = assign_label_type(_anno_matrix->get_annotation().get_all_labels()[0]);

Timer timer;
if (require_accversion_to_taxid_map) {
logger->trace("Parsing label_taxid_map file..");
logger->trace("Parsing label_taxid_map file...");
read_accversion_to_taxid_map(label_taxid_map_filepath, _anno_matrix);
logger->trace("Finished label_taxid_map file in {}s", timer.elapsed());
logger->trace("Finished label_taxid_map file in {} sec", timer.elapsed());
}

timer.reset();
logger->trace("Parsing taxonomic tree..");
logger->trace("Parsing taxonomic tree...");
ChildrenList tree;
read_tree(tax_tree_filepath, &tree);
logger->trace("Finished taxonomic tree read in {}s.", timer.elapsed());
logger->trace("Finished taxonomic tree read in {} sec.", timer.elapsed());

timer.reset();
logger->trace("Calculating tree statistics..");
logger->trace("Calculating tree statistics...");
std::vector<TaxId> tree_linearization;
dfs_statistics(root_node, tree, &tree_linearization);
logger->trace("Finished tree statistics calculation in {}s.", timer.elapsed());
logger->trace("Finished tree statistics calculation in {} sec.", timer.elapsed());

timer.reset();
logger->trace("Starting rmq preprocessing..");
logger->trace("Starting rmq preprocessing...");
rmq_preprocessing(tree_linearization);
logger->trace("Finished rmq preprocessing in {}s.", timer.elapsed());
logger->trace("Finished rmq preprocessing in {} sec.", timer.elapsed());
}

void TaxonomyClsAnno::read_tree(const std::string &tax_tree_filepath, ChildrenList *tree) {
std::ifstream f(tax_tree_filepath);
if (!f.good()) {
logger->error("Failed to open Taxonomic Tree file {}.", tax_tree_filepath);
logger->error("Failed to open Taxonomic Tree file {}", tax_tree_filepath);
exit(1);
}

std::string line;
tsl::hopscotch_map<TaxId, TaxId> full_parents_list;
while (getline(f, line)) {
if (line == "") {
logger->error("The Taxonomic Tree file contains empty lines. Please make sure that this file was not manually modified: {}.",
logger->error("The Taxonomic Tree file contains empty lines. "
"Please make sure that this file was not manually modified: {}",
tax_tree_filepath);
exit(1);
}
std::vector<std::string> parts = utils::split_string(line, "\t");
if (parts.size() <= 2) {
logger->error("The Taxonomic tree filepath contains incomplete lines. Please make sure that this file was not manually modified: {}.",
logger->error("The Taxonomic tree filepath contains incomplete lines. "
"Please make sure that this file was not manually modified: {}",
tax_tree_filepath);
exit(1);
}
uint32_t act = static_cast<uint32_t>(std::stoull(parts[0]));
uint32_t parent = static_cast<uint32_t>(std::stoull(parts[2]));
uint32_t act = std::stoul(parts[0]);
uint32_t parent = std::stoul(parts[2]);
full_parents_list[act] = parent;
node_parent[act] = parent;
}
Expand Down Expand Up @@ -118,7 +207,7 @@ void TaxonomyClsAnno::read_tree(const std::string &tax_tree_filepath, ChildrenLi
}
}
if (num_taxid_failed) {
logger->warn("During the tax_tree_filepath {} parsing, {} taxids were not found out of {} evaluations.",
logger->warn("During the tax_tree_filepath {} parsing, {} taxids were not found out of {} total evaluations.",
tax_tree_filepath, num_taxid_failed, relevant_taxids.size());
}

Expand Down Expand Up @@ -181,24 +270,24 @@ void TaxonomyClsAnno::rmq_preprocessing(const std::vector<TaxId> &tree_lineariza

std::vector<TaxId> TaxonomyClsAnno::get_lca_taxids_for_seq(const std::string_view &sequence, bool reversed) const {
cerr << "Assign class not implemented reversed = " << reversed << "\n";
throw std::runtime_error("get_lca_taxids_for_seq TaxonomyClsAnno not implemented. Received seq size" + to_string(sequence.size()));
exit(0);
throw std::runtime_error("get_lca_taxids_for_seq TaxonomyClsAnno not implemented. Received seq size"
+ to_string(sequence.size()));
}

std::vector<TaxId> TaxonomyClsImportDB::get_lca_taxids_for_seq(const std::string_view &sequence, bool reversed) const {
cerr << "Assign class not implemented reversed = " << reversed << "\n";
throw std::runtime_error("get_lca_taxids_for_seq TaxonomyClsImportDB not implemented. Received seq size" + to_string(sequence.size()));
exit(0);
throw std::runtime_error("get_lca_taxids_for_seq TaxonomyClsImportDB not implemented. Received seq size"
+ to_string(sequence.size()));
}

TaxId TaxonomyClsAnno::find_lca(const std::vector<TaxId> &taxids) const {
throw std::runtime_error("find_lca TaxonomyClsAnno not implemented. Received taxids size" + to_string(taxids.size()));
exit(0);
throw std::runtime_error("find_lca TaxonomyClsAnno not implemented. Received taxids size"
+ to_string(taxids.size()));
}

TaxId TaxonomyClsImportDB::find_lca(const std::vector<TaxId> &taxids) const {
throw std::runtime_error("find_lca TaxonomyClsImportDB not implemented. Received taxids size" + to_string(taxids.size()));
exit(0);
throw std::runtime_error("find_lca TaxonomyClsImportDB not implemented. Received taxids size"
+ to_string(taxids.size()));
}

} // namespace annot
Expand Down
Loading

0 comments on commit db97283

Please sign in to comment.