Skip to content

Commit

Permalink
address comments
Browse files Browse the repository at this point in the history
Signed-off-by: Radu Muntean <[email protected]>
  • Loading branch information
heracle committed Jul 30, 2021
1 parent ebfc931 commit 805735c
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 66 deletions.
23 changes: 12 additions & 11 deletions metagraph/src/annotation/taxonomy/label_to_taxid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@ namespace annot {
using mtg::common::logger;

void TaxonomyBase::assign_label_type(const std::string &label, bool *require_accversion_to_taxid_map) {
if (!utils::starts_with(label, ">gi|")) {
if (utils::starts_with(label, ">gi|")) {
// e.g. >gi|1070643132|ref|NC_031224.1| Arthrobacter phage Mudcat, complete genome
this->label_type = GEN_BANK;
label_type = GEN_BANK;
*require_accversion_to_taxid_map = true;
} else if (!utils::starts_with(label, ">kraken:")) {
} else if (utils::starts_with(label, ">") &&
utils::starts_with(utils::split_string(label, ":")[1], "taxid|")) {
// e.g. >kraken:taxid|2016032|NC_047834.1 Alteromonas virus vB_AspP-H4/4, complete genome
this->label_type = KRAKEN;
label_type = TAXID;
*require_accversion_to_taxid_map = false;
} else {
logger->error("Can't determine the type of the given label {}. Please make sure that the labels are in a recognized format.", label);
Expand All @@ -25,15 +26,15 @@ void TaxonomyBase::assign_label_type(const std::string &label, bool *require_acc
}

bool TaxonomyBase::get_taxid_from_label(const std::string &label, TaxId *taxid) const {
if (this->label_type == KRAKEN) {
if (label_type == TAXID) {
*taxid = static_cast<uint32_t>(std::stoull(utils::split_string(label, "|")[1]));
return true;
} else if (TaxonomyBase::label_type == GEN_BANK) {
std::string acc_version = this->get_accession_version_from_label(label);
if (not this->accversion_to_taxid_map.count(acc_version)) {
std::string acc_version = get_accession_version_from_label(label);
if (not accversion_to_taxid_map.count(acc_version)) {
return false;
}
*taxid = this->accversion_to_taxid_map.at(acc_version);
*taxid = accversion_to_taxid_map.at(acc_version);
return true;
}

Expand All @@ -42,9 +43,9 @@ bool TaxonomyBase::get_taxid_from_label(const std::string &label, TaxId *taxid)
}

std::string TaxonomyBase::get_accession_version_from_label(const std::string &label) const {
if (this->label_type == KRAKEN) {
if (label_type == TAXID) {
return utils::split_string(utils::split_string(label, "|")[2], " ")[0];
} else if (this->label_type == GEN_BANK) {
} else if (label_type == GEN_BANK) {
return utils::split_string(label, "|")[3];;
}

Expand Down Expand Up @@ -86,7 +87,7 @@ void TaxonomyBase::read_accversion_to_taxid_map(const std::string &filepath,
exit(1);
}
if (input_accessions.size() == 0 || input_accessions.count(parts[1])) {
this->accversion_to_taxid_map[parts[1]] = static_cast<TaxId>(std::stoull(parts[2]));
accversion_to_taxid_map[parts[1]] = static_cast<TaxId>(std::stoull(parts[2]));
}
}
}
Expand Down
54 changes: 20 additions & 34 deletions metagraph/src/annotation/taxonomy/tax_classifier.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,8 @@ TaxonomyClsAnno::TaxonomyClsAnno(const graph::AnnotatedDBG &anno,
const double lca_coverage_rate,
const double kmers_discovery_rate,
const std::string &tax_tree_filepath,
const std::string &label_taxid_map_filepath) : _anno_matrix(&anno) {
_lca_coverage_rate = lca_coverage_rate;
_kmers_discovery_rate = kmers_discovery_rate;

const std::string &label_taxid_map_filepath) :
TaxonomyBase(lca_coverage_rate, kmers_discovery_rate), _anno_matrix(&anno) {
if (!std::filesystem::exists(tax_tree_filepath)) {
logger->error("Can't open taxonomic tree file {}.", tax_tree_filepath);
std::exit(1);
Expand Down Expand Up @@ -55,8 +53,7 @@ TaxonomyClsAnno::TaxonomyClsAnno(const graph::AnnotatedDBG &anno,
logger->trace("Finished rmq preprocessing in {}s.", timer.elapsed());
}

void TaxonomyClsAnno::read_tree(const std::string &tax_tree_filepath,
ChildrenList *tree) {
void TaxonomyClsAnno::read_tree(const std::string &tax_tree_filepath, ChildrenList *tree) {
std::ifstream f(tax_tree_filepath);
if (!f.good()) {
logger->error("Failed to open Taxonomic Tree file {}.", tax_tree_filepath);
Expand All @@ -80,16 +77,16 @@ void TaxonomyClsAnno::read_tree(const std::string &tax_tree_filepath,
uint32_t act = static_cast<uint32_t>(std::stoull(parts[0]));
uint32_t parent = static_cast<uint32_t>(std::stoull(parts[2]));
full_parents_list[act] = parent;
this->node_parent[act] = parent;
node_parent[act] = parent;
}

std::vector<TaxId> relevant_taxids;
// 'considered_relevant_taxids' is used to make sure that there are no duplications in 'relevant_taxids'.
tsl::hopscotch_set<TaxId> considered_relevant_taxids;

if (this->accversion_to_taxid_map.size()) {
if (accversion_to_taxid_map.size()) {
// Store only the taxonomic nodes that exists in the annotation matrix.
for (const pair<std::string, TaxId> &it : this->accversion_to_taxid_map) {
for (const pair<std::string, TaxId> &it : accversion_to_taxid_map) {
relevant_taxids.push_back(it.second);
considered_relevant_taxids.insert(it.second);
}
Expand Down Expand Up @@ -117,7 +114,7 @@ void TaxonomyClsAnno::read_tree(const std::string &tax_tree_filepath,

// Check if the current taxid is the root.
if (taxid == full_parents_list[taxid]) {
this->root_node = taxid;
root_node = taxid;
}
}
if (num_taxid_failed) {
Expand All @@ -127,7 +124,7 @@ void TaxonomyClsAnno::read_tree(const std::string &tax_tree_filepath,

// Construct the output tree.
for (const TaxId &taxid : relevant_taxids) {
if (taxid == this->root_node) {
if (taxid == root_node) {
continue;
}
(*tree)[full_parents_list[taxid]].push_back(taxid);
Expand All @@ -137,30 +134,30 @@ void TaxonomyClsAnno::read_tree(const std::string &tax_tree_filepath,
void TaxonomyClsAnno::dfs_statistics(const TaxId node,
const ChildrenList &tree,
std::vector<TaxId> *tree_linearization) {
this->node_to_linearization_idx[node] = tree_linearization->size();
node_to_linearization_idx[node] = tree_linearization->size();
tree_linearization->push_back(node);
uint32_t depth = 0;
for (const TaxId &child : tree.at(node)) {
dfs_statistics(child, tree, tree_linearization);
tree_linearization->push_back(node);
if (this->node_depth[child] > depth) {
depth = this->node_depth[child];
if (node_depth[child] > depth) {
depth = node_depth[child];
}
}
this->node_depth[node] = depth + 1;
node_depth[node] = depth + 1;
}

void TaxonomyClsAnno::rmq_preprocessing(const std::vector<TaxId> &tree_linearization) {
uint32_t num_rmq_rows = log2(tree_linearization.size()) + 1;

this->rmq_data.resize(num_rmq_rows);
rmq_data.resize(num_rmq_rows);
for (uint32_t i = 0; i < num_rmq_rows; ++i) {
this->rmq_data[i].resize(tree_linearization.size());
rmq_data[i].resize(tree_linearization.size());
}

// Copy tree_linearization to rmq[0].
for (uint32_t i = 0; i < tree_linearization.size(); ++i) {
this->rmq_data[0][i] = tree_linearization[i];
rmq_data[0][i] = tree_linearization[i];
}

// Delta represents the size of the RMQ's sliding window (always a power of 2).
Expand All @@ -169,30 +166,19 @@ void TaxonomyClsAnno::rmq_preprocessing(const std::vector<TaxId> &tree_lineariza
for (uint32_t i = 0; i + delta < tree_linearization.size(); ++i) {
// rmq_data[row][i] covers an interval of size delta=2^row and returns the node with the maximal depth among positions [i, i+2^row-1] in the linearization.
// According to 'this->dfs_statistics()': node_depth[leaf] = 1 and node_depth[root] = maximum distance to a leaf.
if (this->node_depth[this->rmq_data[row - 1][i]] >
this->node_depth[this->rmq_data[row - 1][i + delta]]) {
this->rmq_data[row][i] = this->rmq_data[row - 1][i];
if (node_depth[rmq_data[row - 1][i]] >
node_depth[rmq_data[row - 1][i + delta]]) {
rmq_data[row][i] = rmq_data[row - 1][i];
} else {
this->rmq_data[row][i] = this->rmq_data[row - 1][i + delta];
rmq_data[row][i] = rmq_data[row - 1][i + delta];
}
}
delta *= 2;
}

// Compute fast tables for log2 and pow2.
this->fast_log2.resize(tree_linearization.size());
this->fast_pow2.push_back(1);
for (uint32_t i = 2; i < tree_linearization.size(); ++i) {
this->fast_log2[i] = 1 + this->fast_log2[i/2];
if (this->fast_log2[i] > this->fast_log2[i-1]) {
this->fast_pow2.push_back(i);
}
}
}

TaxId TaxonomyClsAnno::assign_class(const std::string &sequence) const {
std::cerr << "assign class not implemented " << sequence << "\n\n";
return 0;
throw std::runtime_error("Assign class not implemented. Received " + sequence);
}

} // namespace annot
Expand Down
24 changes: 9 additions & 15 deletions metagraph/src/annotation/taxonomy/tax_classifier.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,15 @@ class TaxonomyBase {
using node_index = graph::SequenceGraph::node_index;

enum LabelType {
UNASSIGNED, GEN_BANK, KRAKEN
UNASSIGNED,
GEN_BANK, // e.g. ">gi|1070643132|ref|NC_031224.1| Arthrobacter phage Mudcat, complete genome"
TAXID, // e.g. ">kraken:taxid|2016032|NC_047834.1 Alteromonas virus vB_AspP-H4/4, complete genome"
};

TaxonomyBase() {};
TaxonomyBase(const double lca_coverage_rate, const double kmers_discovery_rate) :
_lca_coverage_rate(lca_coverage_rate), _kmers_discovery_rate(kmers_discovery_rate) {};

virtual ~TaxonomyBase() {};

// TODO implement
Expand All @@ -49,8 +55,7 @@ class TaxonomyBase {
* @param [input] filepath -> a ".accession2taxid" file.
* @param [input] anno_matrix -> pointer to the annotation matrix
*/
void read_accversion_to_taxid_map(const std::string &filepath,
const graph::AnnotatedDBG *anno_matrix);
void read_accversion_to_taxid_map(const std::string &filepath, const graph::AnnotatedDBG *anno_matrix);

// TODO implement.
/**
Expand Down Expand Up @@ -119,7 +124,6 @@ class TaxonomyClsAnno : public TaxonomyBase {
const std::string &tax_tree_filepath,
const std::string &label_taxid_map_filepath = "");
TaxonomyClsAnno() {};
~TaxonomyClsAnno() {};

// todo implement
void export_taxdb(const std::string &filepath) const;
Expand All @@ -138,7 +142,7 @@ class TaxonomyClsAnno : public TaxonomyBase {
ChildrenList *tree);

/**
* rmq_preprocessing computes 'this->rmq_data', 'this->precalc_log' and 'this->precalc_pow2' fields.
* rmq_preprocessing computes 'this->rmq_data' field.
*
* @param [input] tree_linearization -> the linearization of the taxonomic tree.
*/
Expand Down Expand Up @@ -172,16 +176,6 @@ class TaxonomyClsAnno : public TaxonomyBase {
*/
tsl::hopscotch_map<TaxId, uint32_t> node_to_linearization_idx;

/**
* fast_log2 is a table for a fast compute of log2(x).
*/
std::vector<uint32_t> fast_log2;

/**
* fast_pow2 is a table for a fast compute of pow2(x).
*/
std::vector<uint32_t> fast_pow2;

const graph::AnnotatedDBG *_anno_matrix = NULL;
};

Expand Down
6 changes: 0 additions & 6 deletions metagraph/tests/annotation/taxonomy/test_taxonomy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,6 @@ TEST (TaxonomyTest, RmqPreprocessing) {
std::vector<uint32_t> linearization = {
0, 1, 4, 7, 4, 8, 4, 1, 5, 1, 0, 2, 0, 3, 6, 3, 0
};
std::vector<uint32_t> expected_pow2 = {1, 2, 4, 8, 16};
std::vector<uint32_t> expected_log2 = {
0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4
};
std::vector<std::vector<uint32_t> > expected_rmq = {
{0, 1, 4, 7, 4, 8, 4, 1, 5, 1, 0, 2, 0, 3, 6, 3, 0},
{0, 1, 4, 4, 4, 4, 1, 1, 1, 0, 0, 0, 0, 3, 3, 0, 0},
Expand All @@ -90,8 +86,6 @@ TEST (TaxonomyTest, RmqPreprocessing) {
};

tax->rmq_preprocessing(linearization);
EXPECT_EQ(expected_pow2, tax->fast_pow2);
EXPECT_EQ(expected_log2, tax->fast_log2);
EXPECT_EQ(expected_rmq, tax->rmq_data);
}

Expand Down

0 comments on commit 805735c

Please sign in to comment.