Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support for multiple successors at forks #329

Open
wants to merge 33 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
40bb74f
verbose logs
karasikov Jun 16, 2021
a2acd85
changes
karasikov Jun 16, 2021
e3d3c7b
assign multiple successors at forks
karasikov Jun 16, 2021
c5feb43
adapted fork_succ bitmap
karasikov Jun 16, 2021
5eec574
always construct rd_succ bitmap
karasikov Jun 21, 2021
7506027
Merge remote-tracking branch 'origin/master' into mk/ann_fork
karasikov Jun 21, 2021
78508ad
assign multiple rd successors only for counts or coordinates
karasikov Jun 21, 2021
5a1a20f
adapted query algorithm for multiple fork successors
karasikov Jun 22, 2021
bdfd7c1
streamlined successor assignment
karasikov Jul 5, 2021
675adfa
optimizations
karasikov Jul 5, 2021
1f50578
minor
karasikov Jul 5, 2021
ccb08a5
Merge branch 'master' into mk/ann_fork
karasikov Jul 7, 2021
36698db
changed formula
karasikov Jul 7, 2021
b7f57f9
cleanup
karasikov Jul 19, 2021
575baac
check only the number of attributes without their width
karasikov Jul 19, 2021
2ac20d1
Merge remote-tracking branch 'origin/master' into mk/ann_fork
karasikov Jul 19, 2021
f01dab1
Merge branch 'master' into mk/ann_fork
karasikov Dec 3, 2021
31eba3e
Merge remote-tracking branch 'origin/master' into mk/ann_fork
karasikov Dec 5, 2021
7facc80
Merge remote-tracking branch 'origin/master' into mk/ann_fork
karasikov Dec 16, 2021
e6113b4
update zlib
karasikov Dec 16, 2021
8b071de
reorganized methods
karasikov Dec 16, 2021
67cd159
moved anchor/fork_succ to IRowDiff
karasikov Dec 16, 2021
fdd7860
minor
karasikov Dec 16, 2021
bdebcd4
fix: don't keep empty tuples
karasikov Dec 16, 2021
36333bc
fix
karasikov Dec 16, 2021
75c9f99
fix
karasikov Dec 17, 2021
16333e4
final fix
karasikov Dec 17, 2021
427706e
cleanup
karasikov Dec 17, 2021
7798135
final final fix
karasikov Dec 18, 2021
a06ab67
cleanup
karasikov Dec 18, 2021
8efd48c
Merge remote-tracking branch 'origin/master' into mk/ann_fork
karasikov Dec 18, 2021
e3da36a
Merge remote-tracking branch 'origin/master' into mk/ann_fork
karasikov Dec 18, 2021
78d258e
Merge remote-tracking branch 'origin/master' into mk/ann_fork
karasikov Jan 14, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 17 additions & 3 deletions metagraph/src/annotation/annotation_converters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1157,7 +1157,8 @@ void convert_to_row_diff(const std::vector<std::string> &files,

if (construction_stage == RowDiffStage::CONVERT) {
assign_anchors(graph_fname, graph_fname, out_dir, max_path_length,
".row_reduction", get_num_threads());
".row_reduction", get_num_threads(),
with_values || with_coordinates);

const std::string anchors_fname = graph_fname + kRowDiffAnchorExt;
if (!fs::exists(anchors_fname)) {
Expand All @@ -1172,6 +1173,20 @@ void convert_to_row_diff(const std::vector<std::string> &files,
return;
}
mem_bytes -= anchor_size;

const std::string rd_succ_fname = graph_fname + kRowDiffForkSuccExt;
if (!fs::exists(rd_succ_fname)) {
logger->error("Can't find row-diff successor bitmap at {}", rd_succ_fname);
exit(1);
}
uint64_t rd_succ_size = fs::file_size(rd_succ_fname);
if (rd_succ_size > mem_bytes) {
logger->warn("row-diff successor bitmap ({} MiB) is larger than"
karasikov marked this conversation as resolved.
Show resolved Hide resolved
" the memory allocated ({} MiB). Reserve more RAM.",
rd_succ_size >> 20, mem_bytes >> 20);
return;
}
mem_bytes -= rd_succ_size;
}

if (!files.size())
Expand Down Expand Up @@ -1230,8 +1245,7 @@ void convert_to_row_diff(const std::vector<std::string> &files,
}

Timer timer;
logger->trace("Annotations in batch: {}",
file_batch.size());
logger->trace("Annotations in batch: {}", file_batch.size());

if (construction_stage == RowDiffStage::COUNT_LABELS) {
count_labels_per_row(file_batch, count_vector_fname, with_coordinates);
Expand Down
33 changes: 10 additions & 23 deletions metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
#include "common/logger.hpp"
#include "common/utils/template_utils.hpp"
#include "graph/annotated_dbg.hpp"
#include "graph/representation/succinct/boss.hpp"
#include "graph/representation/succinct/dbg_succinct.hpp"


Expand Down Expand Up @@ -110,7 +109,7 @@ template <class BaseMatrix>
bool RowDiff<BaseMatrix>::get(Row row, Column column) const {
assert(graph_ && "graph must be loaded");
assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded");
assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size());
assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1);

SetBitPositions set_bits = get_row(row);
SetBitPositions::iterator v = std::lower_bound(set_bits.begin(), set_bits.end(), column);
Expand All @@ -125,7 +124,7 @@ template <class BaseMatrix>
std::vector<BinaryMatrix::Row> RowDiff<BaseMatrix>::get_column(Column column) const {
assert(graph_ && "graph must be loaded");
assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded");
assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size());
assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1);

// TODO: implement a more efficient algorithm
std::vector<Row> result;
Expand All @@ -140,21 +139,16 @@ template <class BaseMatrix>
BinaryMatrix::SetBitPositions RowDiff<BaseMatrix>::get_row(Row row) const {
assert(graph_ && "graph must be loaded");
assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded");
assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size());
assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1);

Vector<uint64_t> result = diffs_.get_row(row);
std::sort(result.begin(), result.end());

uint64_t boss_edge = graph_->kmer_to_boss_index(
graph::AnnotatedSequenceGraph::anno_to_graph_index(row));
const graph::boss::BOSS &boss = graph_->get_boss();
const bit_vector &rd_succ = fork_succ_.size() ? fork_succ_ : boss.get_last();
auto node = graph::AnnotatedSequenceGraph::anno_to_graph_index(row);

while (!anchor_[row]) {
boss_edge = boss.row_diff_successor(boss_edge, rd_succ);

row = graph::AnnotatedSequenceGraph::graph_to_anno_index(
graph_->boss_to_kmer_index(boss_edge));
node = graph_->row_diff_successor(node, fork_succ_);
row = graph::AnnotatedSequenceGraph::graph_to_anno_index(node);

auto diff_row = diffs_.get_row(row);
std::sort(diff_row.begin(), diff_row.end());
Expand All @@ -169,7 +163,7 @@ std::vector<BinaryMatrix::SetBitPositions>
RowDiff<BaseMatrix>::get_rows(const std::vector<Row> &row_ids) const {
assert(graph_ && "graph must be loaded");
assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded");
assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size());
assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1);

// diff rows annotating nodes along the row-diff paths
std::vector<Row> rd_ids;
Expand All @@ -184,19 +178,10 @@ RowDiff<BaseMatrix>::get_rows(const std::vector<Row> &row_ids) const {
// been reached before, and thus, will be reconstructed before this one.
std::vector<std::vector<size_t>> rd_paths_trunc(row_ids.size());

const graph::boss::BOSS &boss = graph_->get_boss();
const bit_vector &rd_succ = fork_succ_.size() ? fork_succ_ : boss.get_last();

for (size_t i = 0; i < row_ids.size(); ++i) {
Row row = row_ids[i];

graph::boss::BOSS::edge_index boss_edge = graph_->kmer_to_boss_index(
graph::AnnotatedSequenceGraph::anno_to_graph_index(row));

while (true) {
row = graph::AnnotatedSequenceGraph::graph_to_anno_index(
graph_->boss_to_kmer_index(boss_edge));

auto [it, is_new] = node_to_rd.try_emplace(row, rd_ids.size());
rd_paths_trunc[i].push_back(it.value());

Expand All @@ -212,7 +197,9 @@ RowDiff<BaseMatrix>::get_rows(const std::vector<Row> &row_ids) const {
if (anchor_[row])
break;

boss_edge = boss.row_diff_successor(boss_edge, rd_succ);
auto node = graph::AnnotatedSequenceGraph::anno_to_graph_index(row);
node = graph_->row_diff_successor(node, fork_succ_);
row = graph::AnnotatedSequenceGraph::graph_to_anno_index(node);
}
}

Expand Down
101 changes: 42 additions & 59 deletions metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
#include "common/logger.hpp"
#include "common/utils/template_utils.hpp"
#include "graph/annotated_dbg.hpp"
#include "graph/representation/succinct/boss.hpp"
#include "graph/representation/succinct/dbg_succinct.hpp"
#include "annotation/binary_matrix/row_diff/row_diff.hpp"
#include "annotation/int_matrix/base/int_matrix.hpp"
Expand Down Expand Up @@ -99,7 +98,7 @@ template <class BaseMatrix>
std::vector<IntMatrix::Row> IntRowDiff<BaseMatrix>::get_column(Column j) const {
assert(graph_ && "graph must be loaded");
assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded");
assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size());
assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1);

// TODO: implement a more efficient algorithm
std::vector<Row> result;
Expand All @@ -122,36 +121,7 @@ IntMatrix::SetBitPositions IntRowDiff<BaseMatrix>::get_row(Row i) const {

template <class BaseMatrix>
IntMatrix::RowValues IntRowDiff<BaseMatrix>::get_row_values(Row row) const {
assert(graph_ && "graph must be loaded");
assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded");
assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size());

RowValues result = diffs_.get_row_values(row);
decode_diffs(&result);
std::sort(result.begin(), result.end());

uint64_t boss_edge = graph_->kmer_to_boss_index(
graph::AnnotatedSequenceGraph::anno_to_graph_index(row));
const graph::boss::BOSS &boss = graph_->get_boss();
const bit_vector &rd_succ = fork_succ_.size() ? fork_succ_ : boss.get_last();

while (!anchor_[row]) {
boss_edge = boss.row_diff_successor(boss_edge, rd_succ);

row = graph::AnnotatedSequenceGraph::graph_to_anno_index(
graph_->boss_to_kmer_index(boss_edge));

RowValues diff_row = diffs_.get_row_values(row);
decode_diffs(&diff_row);
std::sort(diff_row.begin(), diff_row.end());
add_diff(diff_row, &result);
}

assert(std::all_of(result.begin(), result.end(),
[](auto &p) { return p.second; }));
assert(std::all_of(result.begin(), result.end(),
[](auto &p) { return (int64_t)p.second > 0; }));
return result;
return get_row_values(std::vector<Row>{ row })[0];
}

template <class BaseMatrix>
Expand All @@ -176,7 +146,7 @@ std::vector<IntMatrix::RowValues>
IntRowDiff<BaseMatrix>::get_row_values(const std::vector<Row> &row_ids) const {
assert(graph_ && "graph must be loaded");
assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded");
assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size());
assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1);

// diff rows annotating nodes along the row-diff paths
std::vector<Row> rd_ids;
Expand All @@ -189,45 +159,58 @@ IntRowDiff<BaseMatrix>::get_row_values(const std::vector<Row> &row_ids) const {
// Truncated row-diff paths, indexes to |rd_rows|.
// The last index in each path points to an anchor or to a row which had
// been reached before, and thus, will be reconstructed before this one.
std::vector<std::vector<size_t>> rd_paths_trunc(row_ids.size());

const graph::boss::BOSS &boss = graph_->get_boss();
const bit_vector &rd_succ = fork_succ_.size() ? fork_succ_ : boss.get_last();
std::vector<std::vector<std::pair<size_t, size_t>>> rd_paths_trunc(row_ids.size());

for (size_t i = 0; i < row_ids.size(); ++i) {
Row row = row_ids[i];

graph::boss::BOSS::edge_index boss_edge = graph_->kmer_to_boss_index(
graph::AnnotatedSequenceGraph::anno_to_graph_index(row));

while (true) {
row = graph::AnnotatedSequenceGraph::graph_to_anno_index(
graph_->boss_to_kmer_index(boss_edge));

std::vector<std::pair<size_t, size_t>> &rd_path = rd_paths_trunc[i];

std::vector<size_t> path;
Vector<std::pair<size_t, Row>> queue;
queue.emplace_back(0, row_ids[i]);

while (queue.size()) {
size_t depth = queue.back().first;
Row row = queue.back().second;
queue.pop_back();
while (depth < path.size()) {
assert(path.size() > 1);
rd_path.emplace_back(*(path.rbegin() + 1), *path.rbegin());
path.pop_back();
}
auto [it, is_new] = node_to_rd.try_emplace(row, rd_ids.size());
rd_paths_trunc[i].push_back(it.value());

path.push_back(it.value());
// If a node had been reached before, we interrupt the diff path.
// The annotation for that node will have been reconstructed earlier
// than for other nodes in this path as well. Thus, we will start
// reconstruction from that node and don't need its successors.
if (!is_new)
break;
continue;

rd_ids.push_back(row);

if (anchor_[row])
break;
continue;

auto node = graph::AnnotatedSequenceGraph::anno_to_graph_index(row);
graph_->call_row_diff_successors(node, fork_succ_, [&](auto succ) {
queue.emplace_back(depth + 1, graph::AnnotatedSequenceGraph::graph_to_anno_index(succ));
});
}

boss_edge = boss.row_diff_successor(boss_edge, rd_succ);
while (path.size() > 1) {
rd_path.emplace_back(*(path.rbegin() + 1), *path.rbegin());
path.pop_back();
}
assert(path.size());
rd_path.emplace_back(-1, path[0]);
}

node_to_rd = VectorMap<Row, size_t>();

std::vector<RowValues> rd_rows = diffs_.get_row_values(rd_ids);
for (auto &row : rd_rows) {
decode_diffs(&row);
std::sort(row.begin(), row.end());
}

rd_ids = std::vector<Row>();
Expand All @@ -236,17 +219,17 @@ IntRowDiff<BaseMatrix>::get_row_values(const std::vector<Row> &row_ids) const {
std::vector<RowValues> rows(row_ids.size());

for (size_t i = 0; i < row_ids.size(); ++i) {
RowValues &result = rows[i];
const auto &rd_path = rd_paths_trunc[i];
// propagate back and reconstruct full annotations for predecessors
for (auto it = rd_paths_trunc[i].rbegin(); it != rd_paths_trunc[i].rend(); ++it) {
std::sort(rd_rows[*it].begin(), rd_rows[*it].end());
add_diff(rd_rows[*it], &result);
// replace diff row with full reconstructed annotation
rd_rows[*it] = result;
for (size_t j = 0; j + 1 < rd_path.size(); ++j) {
auto [node, succ] = rd_path[j];
// reconstruct annotation by adding the diff (full succ + diff)
add_diff(rd_rows[succ], &rd_rows[node]);
}
assert(std::all_of(result.begin(), result.end(),
rows[i] = rd_rows[rd_path.back().second];
assert(std::all_of(rows[i].begin(), rows[i].end(),
[](auto &p) { return p.second; }));
assert(std::all_of(result.begin(), result.end(),
assert(std::all_of(rows[i].begin(), rows[i].end(),
[](auto &p) { return (int64_t)p.second > 0; }));
}

Expand Down
Loading