From e1001dedd5ee69a13c64c7fc657d5eb7e0a37b81 Mon Sep 17 00:00:00 2001 From: Mikhail Karasikov Date: Tue, 20 Jul 2021 10:47:10 +0100 Subject: [PATCH 01/35] Query diff-transformed coordinate annotations (#338) --- metagraph/integration_tests/base.py | 3 +- metagraph/integration_tests/test_query.py | 1 + .../annotation/int_matrix/base/int_matrix.hpp | 3 - .../int_matrix/row_diff/int_row_diff.hpp | 2 +- .../int_matrix/row_diff/tuple_row_diff.hpp | 308 ++++++++++++++++++ .../annotation_matrix/annotation_matrix.cpp | 2 + .../static_annotators_def.hpp | 5 + metagraph/src/annotation/row_diff_builder.cpp | 2 +- metagraph/src/cli/config/config.cpp | 7 +- metagraph/src/cli/config/config.hpp | 1 + metagraph/src/cli/load/load_annotation.cpp | 7 + metagraph/src/cli/transform_annotation.cpp | 56 +++- 12 files changed, 379 insertions(+), 18 deletions(-) create mode 100644 metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp diff --git a/metagraph/integration_tests/base.py b/metagraph/integration_tests/base.py index 84f3f5aafe..65c0d972e0 100644 --- a/metagraph/integration_tests/base.py +++ b/metagraph/integration_tests/base.py @@ -141,6 +141,7 @@ def _annotate_graph(input, graph_path, output, anno_repr, {output + anno_file_extension[anno_repr]}' other_args = ' --count-kmers' if with_counts else '' + other_args += ' --coordinates' if final_anno.endswith('_coord') else '' if target_anno == 'row_diff': command += ' -i ' + graph_path @@ -170,7 +171,7 @@ def _annotate_graph(input, graph_path, output, anno_repr, assert(res.returncode == 0) if final_anno != target_anno: - rd_type = 'column' if with_counts else 'row_diff' + rd_type = 'column' if with_counts or final_anno.endswith('_coord') else 'row_diff' command = f'{METAGRAPH} transform_anno --anno-type {final_anno} --greedy -o {output} ' \ f'-i {graph_path} -p {NUM_THREADS} {output}.{rd_type}.annodbg' res = subprocess.run([command], shell=True) diff --git a/metagraph/integration_tests/test_query.py b/metagraph/integration_tests/test_query.py index 159cab88e3..17f759b974 100644 --- a/metagraph/integration_tests/test_query.py +++ b/metagraph/integration_tests/test_query.py @@ -18,6 +18,7 @@ anno_file_extension = {'column': '.column.annodbg', 'column_coord': '.column_coord.annodbg', + 'row_diff_coord': '.row_diff_coord.annodbg', 'row': '.row.annodbg', 'row_diff': '.row_diff.annodbg', 'row_sparse': '.row_sparse.annodbg', diff --git a/metagraph/src/annotation/int_matrix/base/int_matrix.hpp b/metagraph/src/annotation/int_matrix/base/int_matrix.hpp index 00beb51f6e..c28e5aa7ec 100644 --- a/metagraph/src/annotation/int_matrix/base/int_matrix.hpp +++ b/metagraph/src/annotation/int_matrix/base/int_matrix.hpp @@ -52,9 +52,6 @@ class MultiIntMatrix : public IntMatrix { virtual std::vector get_row_tuples(const std::vector &rows) const = 0; - - virtual bool load_tuples(std::istream &in) = 0; - virtual void serialize_tuples(std::ostream &out) const = 0; }; } // namespace matrix diff --git a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp index 3f5655d76d..a6d8ecdaab 100644 --- a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp +++ b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp @@ -49,7 +49,7 @@ class IntRowDiff : public binmat::IRowDiff, public IntMatrix { public: using anchor_bv_type = bit_vector_small; using fork_succ_bv_type = bit_vector_small; - static_assert(std::is_convertible::value); + static_assert(std::is_convertible::value); IntRowDiff() {} diff --git a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp new file mode 100644 index 0000000000..10c4ee62db --- /dev/null +++ b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp @@ -0,0 +1,308 @@ +#ifndef __TUPLE_ROW_DIFF_HPP__ +#define __TUPLE_ROW_DIFF_HPP__ + +#include +#include +#include +#include +#include + +#include "common/vectors/bit_vector_adaptive.hpp" +#include "common/vector_map.hpp" +#include "common/vector.hpp" +#include "common/logger.hpp" +#include "common/utils/template_utils.hpp" +#include "graph/annotated_dbg.hpp" +#include "graph/representation/succinct/boss.hpp" +#include "graph/representation/succinct/dbg_succinct.hpp" +#include "annotation/binary_matrix/row_diff/row_diff.hpp" +#include "annotation/int_matrix/base/int_matrix.hpp" + + +namespace mtg { +namespace annot { +namespace matrix { + +template +class TupleRowDiff : public binmat::IRowDiff, public MultiIntMatrix { + public: + using anchor_bv_type = bit_vector_small; + using fork_succ_bv_type = bit_vector_small; + static_assert(std::is_convertible::value); + static const int SHIFT = 1; // coordinates increase by 1 at each edge + + TupleRowDiff() {} + + TupleRowDiff(const graph::DBGSuccinct *graph, BaseMatrix&& diff) + : diffs_(std::move(diff)) { graph_ = graph; } + + bool get(Row i, Column j) const override; + std::vector get_column(Column j) const override; + SetBitPositions get_row(Row i) const override; + std::vector get_rows(const std::vector &rows) const override; + RowTuples get_row_tuples(Row i) const override; + std::vector get_row_tuples(const std::vector &rows) const override; + + uint64_t num_columns() const override { return diffs_.num_columns(); } + uint64_t num_relations() const override { return diffs_.num_relations(); } + uint64_t num_attributes() const override { return diffs_.num_attributes(); } + uint64_t num_rows() const override { return diffs_.num_rows(); } + + bool load(std::istream &in) override; + void serialize(std::ostream &out) const override; + + void load_fork_succ(const std::string &filename); + void load_anchor(const std::string &filename); + + const anchor_bv_type& anchor() const { return anchor_; } + const BaseMatrix& diffs() const { return diffs_; } + BaseMatrix& diffs() { return diffs_; } + + private: + static void decode_diffs(RowTuples *diffs); + static void add_diff(const RowTuples &diff, RowTuples *row); + + BaseMatrix diffs_; + anchor_bv_type anchor_; + fork_succ_bv_type fork_succ_; +}; + + +template +bool TupleRowDiff::get(Row i, Column j) const { + SetBitPositions set_bits = get_row(i); + auto v = std::lower_bound(set_bits.begin(), set_bits.end(), j); + return v != set_bits.end() && *v == j; +} + +template +std::vector TupleRowDiff::get_column(Column j) const { + assert(graph_ && "graph must be loaded"); + assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded"); + assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size()); + + // TODO: implement a more efficient algorithm + std::vector result; + for (Row i = 0; i < num_rows(); ++i) { + if (get(i, j)) + result.push_back(i); + } + return result; +} + +template +MultiIntMatrix::SetBitPositions TupleRowDiff::get_row(Row i) const { + RowTuples row = get_row_tuples(i); + SetBitPositions result(row.size()); + for (size_t k = 0; k < row.size(); ++k) { + result[k] = row[k].first; + } + return result; +} + +template +std::vector +TupleRowDiff::get_rows(const std::vector &row_ids) const { + std::vector result; + result.reserve(row_ids.size()); + + for (auto&& row : get_row_tuples(row_ids)) { + result.emplace_back(row.size()); + for (size_t k = 0; k < row.size(); ++k) { + result.back()[k] = row[k].first; + } + row = RowTuples(); + } + + return result; +} + +template +MultiIntMatrix::RowTuples TupleRowDiff::get_row_tuples(Row row) const { + return get_row_tuples(std::vector{ row })[0]; +} + +template +std::vector +TupleRowDiff::get_row_tuples(const std::vector &row_ids) const { + assert(graph_ && "graph must be loaded"); + assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded"); + assert(!fork_succ_.size() || fork_succ_.size() == graph_->get_boss().get_last().size()); + + const size_t RD_PATH_RESERVE_SIZE = 2; + + // diff rows annotating nodes along the row-diff paths + std::vector rd_ids; + rd_ids.reserve(row_ids.size() * RD_PATH_RESERVE_SIZE); + + // map row index to its index in |rd_rows| + VectorMap node_to_rd; + node_to_rd.reserve(row_ids.size() * RD_PATH_RESERVE_SIZE); + + // Truncated row-diff paths, indexes to |rd_rows|. + // The last index in each path points to an anchor or to a row which had + // been reached before, and thus, will be reconstructed before this one. + std::vector> rd_paths_trunc(row_ids.size()); + + const graph::boss::BOSS &boss = graph_->get_boss(); + const bit_vector &rd_succ = fork_succ_.size() ? fork_succ_ : boss.get_last(); + + for (size_t i = 0; i < row_ids.size(); ++i) { + Row row = row_ids[i]; + + graph::boss::BOSS::edge_index boss_edge = graph_->kmer_to_boss_index( + graph::AnnotatedSequenceGraph::anno_to_graph_index(row)); + + while (true) { + row = graph::AnnotatedSequenceGraph::graph_to_anno_index( + graph_->boss_to_kmer_index(boss_edge)); + + auto [it, is_new] = node_to_rd.try_emplace(row, rd_ids.size()); + rd_paths_trunc[i].push_back(it.value()); + + // If a node had been reached before, we interrupt the diff path. + // The annotation for that node will have been reconstructed earlier + // than for other nodes in this path as well. Thus, we will start + // reconstruction from that node and don't need its successors. + if (!is_new) + break; + + rd_ids.push_back(row); + + if (anchor_[row]) + break; + + boss_edge = boss.row_diff_successor(boss_edge, rd_succ); + } + } + + node_to_rd = VectorMap(); + + std::vector rd_rows = diffs_.get_row_tuples(rd_ids); + for (auto &row : rd_rows) { + decode_diffs(&row); + } + + rd_ids = std::vector(); + + // reconstruct annotation rows from row-diff + std::vector rows(row_ids.size()); + + for (size_t i = 0; i < row_ids.size(); ++i) { + RowTuples &result = rows[i]; + + auto it = rd_paths_trunc[i].rbegin(); + std::sort(rd_rows[*it].begin(), rd_rows[*it].end()); + result = rd_rows[*it]; + // propagate back and reconstruct full annotations for predecessors + for (++it ; it != rd_paths_trunc[i].rend(); ++it) { + std::sort(rd_rows[*it].begin(), rd_rows[*it].end()); + add_diff(rd_rows[*it], &result); + // replace diff row with full reconstructed annotation + rd_rows[*it] = result; + } + assert(std::all_of(result.begin(), result.end(), + [](auto &p) { return p.second.size(); })); + } + + return rows; +} + +template +bool TupleRowDiff::load(std::istream &in) { + std::string version(4, '\0'); + in.read(version.data(), 4); + return anchor_.load(in) && fork_succ_.load(in) && diffs_.load(in); +} + +template +void TupleRowDiff::serialize(std::ostream &out) const { + out.write("v2.0", 4); + anchor_.serialize(out); + fork_succ_.serialize(out); + diffs_.serialize(out); +} + +template +void TupleRowDiff::decode_diffs(RowTuples *diffs) { + std::ignore = diffs; + // no encoding +} + +template +void TupleRowDiff::add_diff(const RowTuples &diff, RowTuples *row) { + assert(std::is_sorted(row->begin(), row->end())); + assert(std::is_sorted(diff.begin(), diff.end())); + + if (diff.size()) { + RowTuples result; + result.reserve(row->size() + diff.size()); + + auto it = row->begin(); + auto it2 = diff.begin(); + while (it != row->end() && it2 != diff.end()) { + if (it->first < it2->first) { + result.push_back(*it); + ++it; + } else if (it->first > it2->first) { + result.push_back(*it2); + ++it2; + } else { + if (it2->second.size()) { + result.emplace_back(it->first, Tuple{}); + std::set_symmetric_difference(it->second.begin(), it->second.end(), + it2->second.begin(), it2->second.end(), + std::back_inserter(result.back().second)); + } + ++it; + ++it2; + } + } + std::copy(it, row->end(), std::back_inserter(result)); + std::copy(it2, diff.end(), std::back_inserter(result)); + + row->swap(result); + } + + assert(std::is_sorted(row->begin(), row->end())); + for (auto &[j, tuple] : *row) { + assert(std::is_sorted(tuple.begin(), tuple.end())); + for (uint64_t &c : tuple) { + c -= SHIFT; + } + } +} + +template +void TupleRowDiff::load_anchor(const std::string &filename) { + if (!std::filesystem::exists(filename)) { + common::logger->error("Can't read anchor file: {}", filename); + std::exit(1); + } + std::ifstream in(filename, ios::binary); + if (!in.good()) { + common::logger->error("Could not open anchor file {}", filename); + std::exit(1); + } + anchor_.load(in); +} + +template +void TupleRowDiff::load_fork_succ(const std::string &filename) { + if (!std::filesystem::exists(filename)) { + common::logger->error("Can't read fork successor file: {}", filename); + std::exit(1); + } + std::ifstream in(filename, ios::binary); + if (!in.good()) { + common::logger->error("Could not open fork successor file {}", filename); + std::exit(1); + } + fork_succ_.load(in); +} + +} // namespace matrix +} // namespace annot +} // namespace mtg + +#endif // __TUPLE_ROW_DIFF_HPP__ diff --git a/metagraph/src/annotation/representation/annotation_matrix/annotation_matrix.cpp b/metagraph/src/annotation/representation/annotation_matrix/annotation_matrix.cpp index 32f06e4909..4c52971802 100644 --- a/metagraph/src/annotation/representation/annotation_matrix/annotation_matrix.cpp +++ b/metagraph/src/annotation/representation/annotation_matrix/annotation_matrix.cpp @@ -216,5 +216,7 @@ template class StaticBinRelAnnotator; template class StaticBinRelAnnotator, std::string>; +template class StaticBinRelAnnotator>, std::string>; + } // namespace annot } // namespace mtg diff --git a/metagraph/src/annotation/representation/annotation_matrix/static_annotators_def.hpp b/metagraph/src/annotation/representation/annotation_matrix/static_annotators_def.hpp index d1f53f31ac..7ac28b2589 100644 --- a/metagraph/src/annotation/representation/annotation_matrix/static_annotators_def.hpp +++ b/metagraph/src/annotation/representation/annotation_matrix/static_annotators_def.hpp @@ -15,6 +15,7 @@ #include "annotation/binary_matrix/row_vector/unique_row_binmat.hpp" #include "annotation/int_matrix/rank_extended/csc_matrix.hpp" #include "annotation/int_matrix/row_diff/int_row_diff.hpp" +#include "annotation/int_matrix/row_diff/tuple_row_diff.hpp" #include "annotation/int_matrix/csr_matrix/csr_matrix.hpp" #include "annotation/int_matrix/rank_extended/tuple_csc_matrix.hpp" @@ -54,6 +55,8 @@ typedef StaticBinRelAnnotator IntRowAnnotator; typedef StaticBinRelAnnotator, std::string> ColumnCoordAnnotator; +typedef StaticBinRelAnnotator>, std::string> RowDiffCoordAnnotator; + template <> inline const std::string RowFlatAnnotator::kExtension = ".flat.annodbg"; @@ -85,6 +88,8 @@ template <> inline const std::string IntRowAnnotator::kExtension = ".int_csr.annodbg"; template <> inline const std::string ColumnCoordAnnotator::kExtension = ".column_coord.annodbg"; +template <> +inline const std::string RowDiffCoordAnnotator::kExtension = ".row_diff_coord.annodbg"; } // namespace annot } // namespace mtg diff --git a/metagraph/src/annotation/row_diff_builder.cpp b/metagraph/src/annotation/row_diff_builder.cpp index aee80ac04c..2599606cc7 100644 --- a/metagraph/src/annotation/row_diff_builder.cpp +++ b/metagraph/src/annotation/row_diff_builder.cpp @@ -86,8 +86,8 @@ void load_coordinates(const std::vector &source_files, bit_vector_smart delims; for (size_t j = 0; j < sources[i].num_labels(); ++j) { try { - coords.load(in); delims.load(in); + coords.load(in); } catch (...) { logger->error("Couldn't read coordinates from {}", coords_fname); exit(1); diff --git a/metagraph/src/cli/config/config.cpp b/metagraph/src/cli/config/config.cpp index d5c2536db4..f9a5172b7d 100644 --- a/metagraph/src/cli/config/config.cpp +++ b/metagraph/src/cli/config/config.cpp @@ -583,7 +583,8 @@ Config::Config(int argc, char *argv[]) { const bool to_row_diff = anno_type == RowDiff || anno_type == RowDiffBRWT || anno_type == IntRowDiffBRWT - || anno_type == RowDiffRowSparse; + || anno_type == RowDiffRowSparse + || anno_type == RowDiffCoord; if (to_row_diff && !infbase.size()) { std::cerr << "Path to graph must be passed with '-i '" << std::endl; print_usage_and_exit = true; @@ -704,6 +705,8 @@ std::string Config::annotype_to_string(AnnotationType state) { return "row_diff_int_brwt"; case ColumnCoord: return "column_coord"; + case RowDiffCoord: + return "row_diff_coord"; } throw std::runtime_error("Never happens"); } @@ -739,6 +742,8 @@ Config::AnnotationType Config::string_to_annotype(const std::string &string) { return AnnotationType::IntRowDiffBRWT; } else if (string == "column_coord") { return AnnotationType::ColumnCoord; + } else if (string == "row_diff_coord") { + return AnnotationType::RowDiffCoord; } else { std::cerr << "Error: unknown annotation representation" << std::endl; exit(1); diff --git a/metagraph/src/cli/config/config.hpp b/metagraph/src/cli/config/config.hpp index 4a19a3ab32..9ae8706158 100644 --- a/metagraph/src/cli/config/config.hpp +++ b/metagraph/src/cli/config/config.hpp @@ -201,6 +201,7 @@ class Config { IntBRWT, IntRowDiffBRWT, ColumnCoord, + RowDiffCoord, }; enum GraphType { diff --git a/metagraph/src/cli/load/load_annotation.cpp b/metagraph/src/cli/load/load_annotation.cpp index d9865ad60a..735fca17e2 100644 --- a/metagraph/src/cli/load/load_annotation.cpp +++ b/metagraph/src/cli/load/load_annotation.cpp @@ -24,6 +24,9 @@ Config::AnnotationType parse_annotation_type(const std::string &filename) { } else if (utils::ends_with(filename, annot::ColumnCoordAnnotator::kExtension)) { return Config::AnnotationType::ColumnCoord; + } else if (utils::ends_with(filename, annot::RowDiffCoordAnnotator::kExtension)) { + return Config::AnnotationType::RowDiffCoord; + } else if (utils::ends_with(filename, annot::RowDiffColumnAnnotator::kExtension)) { return Config::AnnotationType::RowDiff; @@ -144,6 +147,10 @@ initialize_annotation(Config::AnnotationType anno_type, annotation.reset(new annot::ColumnCoordAnnotator()); break; } + case Config::RowDiffCoord: { + annotation.reset(new annot::RowDiffCoordAnnotator()); + break; + } } return annotation; diff --git a/metagraph/src/cli/transform_annotation.cpp b/metagraph/src/cli/transform_annotation.cpp index 402cd744e3..ccbcc83298 100644 --- a/metagraph/src/cli/transform_annotation.cpp +++ b/metagraph/src/cli/transform_annotation.cpp @@ -673,6 +673,45 @@ int transform_annotation(Config *config) { logger->trace("Serialized to {}", config->outfbase); break; } + case Config::RowDiffCoord: { + assert(config->infbase.size()); + const std::string anchors_file = config->infbase + annot::binmat::kRowDiffAnchorExt; + if (!std::filesystem::exists(anchors_file)) { + logger->error("Anchor bitmap {} does not exist. Run the row_diff" + " transform followed by anchor optimization.", anchors_file); + std::exit(1); + } + const std::string fork_succ_file = config->infbase + annot::binmat::kRowDiffForkSuccExt; + if (!std::filesystem::exists(fork_succ_file)) { + logger->error("Fork successor bitmap {} does not exist", fork_succ_file); + std::exit(1); + } + + auto label_encoder = annotator->get_label_encoder(); + using RDMatrix = matrix::TupleRowDiff>; + auto tuple_matrix = std::make_unique(nullptr, annotator->release_matrix()); + tuple_matrix->load_anchor(anchors_file); + tuple_matrix->load_fork_succ(fork_succ_file); + logger->trace("RowDiff support bitmaps loaded"); + + if (files.size() > 1) { + logger->error("Merging coordinates from multiple columns is not supported"); + exit(1); + } + auto coords_fname = utils::remove_suffix(files.at(0), + ColumnCompressed<>::kExtension) + + ColumnCompressed<>::kCoordExtension; + std::ifstream in(coords_fname); + tuple_matrix->diffs().load_tuples(in); + + RowDiffCoordAnnotator annotation(std::move(tuple_matrix), label_encoder); + + logger->trace("Annotation converted in {} sec", timer.elapsed()); + + annotation.serialize(config->outfbase); + logger->trace("Serialized to {}", config->outfbase); + break; + } case Config::RowDiffBRWT: { logger->error("Convert to row_diff first, and then to row_diff_brwt"); return 0; @@ -774,20 +813,15 @@ int transform_annotation(Config *config) { auto int_annotation = convert_to_IntMultiBRWT(files, *config, timer); logger->trace("Annotation converted in {} sec", timer.elapsed()); + auto label_encoder = int_annotation.get_label_encoder(); using CSCMatrix = matrix::CSCMatrix; - - IntRowDiffBRWTAnnotator annotation( - std::make_unique>(nullptr, - std::move(*int_annotation.release_matrix())), - int_annotation.get_label_encoder()); - - const_cast &>(annotation.get_matrix()) - .load_anchor(anchors_file); - const_cast &>(annotation.get_matrix()) - .load_fork_succ(fork_succ_file); - + auto matrix = std::make_unique>(nullptr, + std::move(*int_annotation.release_matrix())); + matrix->load_anchor(anchors_file); + matrix->load_fork_succ(fork_succ_file); logger->trace("RowDiff support bitmaps loaded"); + IntRowDiffBRWTAnnotator annotation(std::move(matrix), std::move(label_encoder)); annotation.serialize(config->outfbase); logger->trace("Serialized to {}", config->outfbase); break; From 85d66a3c0ebf24d80ec1b70f5fe9ce5b5150024b Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Sat, 24 Jul 2021 15:54:54 +0200 Subject: [PATCH 02/35] minor cleanup --- metagraph/src/cli/align.cpp | 4 ++-- metagraph/src/cli/load/load_annotated_graph.cpp | 13 +++---------- metagraph/src/common/bloom_filter.cpp | 5 ----- metagraph/src/common/utils/simd_utils.hpp | 9 ++------- metagraph/src/graph/annotated_graph_algorithm.cpp | 1 - 5 files changed, 7 insertions(+), 25 deletions(-) diff --git a/metagraph/src/cli/align.cpp b/metagraph/src/cli/align.cpp index ef25a15b0d..f290ec7053 100644 --- a/metagraph/src/cli/align.cpp +++ b/metagraph/src/cli/align.cpp @@ -1,5 +1,7 @@ #include "align.hpp" +#include + #include "common/logger.hpp" #include "common/unix_tools.hpp" #include "common/threads/threading.hpp" @@ -12,8 +14,6 @@ #include "config/config.hpp" #include "load/load_graph.hpp" -#include - namespace mtg { namespace cli { diff --git a/metagraph/src/cli/load/load_annotated_graph.cpp b/metagraph/src/cli/load/load_annotated_graph.cpp index 3c9e0ec90c..db8cbd7535 100644 --- a/metagraph/src/cli/load/load_annotated_graph.cpp +++ b/metagraph/src/cli/load/load_annotated_graph.cpp @@ -22,15 +22,14 @@ using mtg::common::logger; std::unique_ptr initialize_annotated_dbg(std::shared_ptr graph, const Config &config) { + uint64_t max_index = graph->max_index(); + const auto *dbg_graph = dynamic_cast(graph.get()); + if (graph->get_mode() == DeBruijnGraph::PRIMARY) { graph = std::make_shared(graph); logger->trace("Primary graph was wrapped into canonical"); } - uint64_t max_index = graph->max_index(); - if (const auto *canonical = dynamic_cast(graph.get())) - max_index = canonical->get_graph().max_index(); - auto annotation_temp = config.infbase_annotators.size() ? initialize_annotation(config.infbase_annotators.at(0), config, 0) : initialize_annotation(config.anno_type, config, max_index); @@ -56,12 +55,6 @@ std::unique_ptr initialize_annotated_dbg(std::shared_ptr(annotation_temp->get_matrix()); if (IRowDiff *row_diff = dynamic_cast(&matrix)) { - const DBGSuccinct *dbg_graph; - if (auto *canonical = dynamic_cast(graph.get())) { - dbg_graph = dynamic_cast(&canonical->get_graph()); - } else { - dbg_graph = dynamic_cast(graph.get()); - } if (!dbg_graph) { logger->error("Only succinct de Bruijn graph representations" " are supported for row-diff annotations"); diff --git a/metagraph/src/common/bloom_filter.cpp b/metagraph/src/common/bloom_filter.cpp index c7491a11ef..44b2ec3d0d 100644 --- a/metagraph/src/common/bloom_filter.cpp +++ b/metagraph/src/common/bloom_filter.cpp @@ -1,11 +1,6 @@ #include "bloom_filter.hpp" #include "common/utils/simd_utils.hpp" - -#ifdef __AVX2__ -#include -#endif - #include "common/serialization.hpp" // used to implement size % 512 diff --git a/metagraph/src/common/utils/simd_utils.hpp b/metagraph/src/common/utils/simd_utils.hpp index b0c2f6632b..98d8ee9284 100644 --- a/metagraph/src/common/utils/simd_utils.hpp +++ b/metagraph/src/common/utils/simd_utils.hpp @@ -1,16 +1,11 @@ #ifndef __SIMD_UTILS_HPP__ #define __SIMD_UTILS_HPP__ -#ifdef __AVX2__ -#include -#endif - -#ifdef __SSE2__ -#include -#endif +#include #include #include +#include // Branch prediction helper macros #ifndef LIKELY diff --git a/metagraph/src/graph/annotated_graph_algorithm.cpp b/metagraph/src/graph/annotated_graph_algorithm.cpp index 5d7129b4be..b5ac80791d 100644 --- a/metagraph/src/graph/annotated_graph_algorithm.cpp +++ b/metagraph/src/graph/annotated_graph_algorithm.cpp @@ -17,7 +17,6 @@ using mtg::common::logger; typedef AnnotatedDBG::node_index node_index; typedef AnnotatedDBG::row_index row_index; typedef AnnotatedDBG::Annotator::Label Label; -typedef Alignment DBGAlignment; std::unique_ptr From 628c1bce10da8a38bba7aaeee7de4876474cc3e7 Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Sat, 24 Jul 2021 16:22:03 +0200 Subject: [PATCH 03/35] fix query sequence identifier index when using multiple threads --- metagraph/src/cli/query.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/metagraph/src/cli/query.cpp b/metagraph/src/cli/query.cpp index 13322d0f7d..5ac845a9a6 100644 --- a/metagraph/src/cli/query.cpp +++ b/metagraph/src/cli/query.cpp @@ -971,7 +971,7 @@ ::batched_query_fasta(seq_io::FastaParser &fasta_parser, const uint64_t batch_size = config_.query_batch_size_in_bytes; - std::atomic seq_count = 0; + size_t seq_count = 0; while (it != end) { Timer batch_timer; @@ -1017,11 +1017,13 @@ ::batched_query_fasta(seq_io::FastaParser &fasta_parser, #pragma omp parallel for num_threads(get_num_threads()) schedule(dynamic) for (size_t i = 0; i < seq_batch.size(); ++i) { - callback(query_sequence(seq_count++, seq_batch[i].first, seq_batch[i].second, + callback(query_sequence(seq_count + i, seq_batch[i].first, seq_batch[i].second, *query_graph, config_, config_.batch_align ? aligner_config_.get() : NULL)); } + seq_count += seq_batch.size(); + logger->trace("Batch of {} bytes from '{}' queried in {} sec", num_bytes_read, fasta_parser.get_filename(), batch_timer.elapsed()); } From ce5f2bea154b624e7215aeab18f3a2dd49285e27 Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Sat, 24 Jul 2021 22:57:39 +0200 Subject: [PATCH 04/35] clean up Cigar interface --- .../src/graph/alignment/aligner_alignment.cpp | 84 ++++++------------ .../src/graph/alignment/aligner_alignment.hpp | 41 +++------ .../src/graph/alignment/aligner_cigar.cpp | 42 +++++---- .../src/graph/alignment/aligner_cigar.hpp | 81 ++++++++--------- .../src/graph/alignment/aligner_config.cpp | 19 +++- .../alignment/aligner_extender_methods.cpp | 10 +-- metagraph/tests/graph/test_aligner.cpp | 86 ++++++++++--------- 7 files changed, 171 insertions(+), 192 deletions(-) diff --git a/metagraph/src/graph/alignment/aligner_alignment.cpp b/metagraph/src/graph/alignment/aligner_alignment.cpp index 758844e535..440fb5ee0c 100644 --- a/metagraph/src/graph/alignment/aligner_alignment.cpp +++ b/metagraph/src/graph/alignment/aligner_alignment.cpp @@ -45,54 +45,28 @@ Alignment::Alignment(std::string_view query, template void Alignment::append(Alignment&& other) { - assert(query_.data() + query_.size() == other.query_.data()); + assert(query_.data() + query_.size() + other.get_clipping() == other.query_.data()); assert(orientation_ == other.orientation_); - assert(cigar_.empty() || cigar_.back().first != Cigar::CLIPPED); nodes_.insert(nodes_.end(), other.nodes_.begin(), other.nodes_.end()); sequence_ += std::move(other.sequence_); score_ += other.score_; cigar_.append(std::move(other.cigar_)); - query_ = { query_.data(), query_.size() + other.query_.size() }; + + // expand the query window to cover both alignments + query_ = std::string_view(query_.data(), other.query_.end() - query_.begin()); } template -void Alignment::trim_offset() { - if (!offset_ || empty() || cigar_.empty()) - return; - - auto it = cigar_.begin(); - if (it->first == Cigar::CLIPPED) - ++it; - - if (it == cigar_.end()) - return; - - auto jt = nodes_.begin(); - size_t counter = 0; - while (offset_ && it != cigar_.end() && jt != nodes_.end()) { - if (counter == it->second - || it->first == Cigar::CLIPPED - || it->first == Cigar::INSERTION) { - ++it; - counter = 0; - continue; - } - - size_t jump = std::min({ offset_, static_cast(it->second), - static_cast(nodes_.end() - jt) }); - offset_ -= jump; - counter += jump; - jt += jump; - } - - if (jt == nodes_.end()) { - --jt; - ++offset_; - } - - nodes_.erase(nodes_.begin(), jt); +size_t Alignment::trim_offset() { + if (!offset_ || nodes_.size() <= 1) + return 0; + + size_t trim = std::min(offset_, nodes_.size() - 1); + offset_ -= trim; + nodes_.erase(nodes_.begin(), nodes_.begin() + trim); + return trim; } template @@ -235,7 +209,7 @@ void Alignment::reverse_complement(const DeBruijnGraph &graph, assert(graph.get_node_sequence(nodes_[0]).substr(offset_) == sequence_); } - std::reverse(cigar_.begin(), cigar_.end()); + std::reverse(cigar_.data().begin(), cigar_.data().end()); assert(query_rev_comp.size() >= get_clipping() + get_end_clipping()); orientation_ = !orientation_; @@ -253,13 +227,13 @@ Json::Value Alignment::path_json(size_t node_size, Json::Value path; - auto cigar_it = cigar_.begin(); + auto cigar_it = cigar_.data().begin(); if (cigar_.size() && cigar_it->first == Cigar::CLIPPED) { cigar_it++; } size_t cigar_offset = 0; - assert(cigar_it != cigar_.end()); + assert(cigar_it != cigar_.data().end()); int64_t rank = 1; const char *query_start = query_.data(); @@ -283,7 +257,7 @@ Json::Value Alignment::path_json(size_t node_size, mapping["position"] = position; // handle alignment to the first node - while (cur_pos < node_size && cigar_it != cigar_.end()) { + while (cur_pos < node_size && cigar_it != cigar_.data().end()) { assert(cigar_it->second > cigar_offset); size_t next_pos = std::min(node_size, cur_pos + (cigar_it->second - cigar_offset)); @@ -323,7 +297,7 @@ Json::Value Alignment::path_json(size_t node_size, case Cigar::CLIPPED: { ++cigar_it; cigar_offset = 0; - assert(cigar_it == cigar_.end()); + assert(cigar_it == cigar_.data().end()); continue; } } @@ -344,7 +318,7 @@ Json::Value Alignment::path_json(size_t node_size, // handle the rest of the alignment for (auto node_it = nodes_.begin() + 1; node_it != nodes_.end(); ++node_it) { - assert(cigar_it != cigar_.end()); + assert(cigar_it != cigar_.data().end()); assert(cigar_it->second > cigar_offset); Json::Value mapping; @@ -355,7 +329,7 @@ Json::Value Alignment::path_json(size_t node_size, //position["is_reverse"] = false; mapping["position"] = position; - if (cigar_it->first == Cigar::INSERTION) { + if (cigar_it->first == Cigar::INSERTION || cigar_it->first == Cigar::CLIPPED) { Json::Value edit; size_t length = cigar_it->second - cigar_offset; assert(query_start + length < query_end); @@ -367,7 +341,7 @@ Json::Value Alignment::path_json(size_t node_size, ++cigar_it; cigar_offset = 0; mapping["edit"].append(edit); - assert(cigar_it != cigar_.end()); + assert(cigar_it != cigar_.data().end()); } Json::Value edit; @@ -403,8 +377,8 @@ Json::Value Alignment::path_json(size_t node_size, } assert(query_start == query_end); - assert(cigar_it == cigar_.end() - || (cigar_it + 1 == cigar_.end() && cigar_it->first == Cigar::CLIPPED)); + assert(cigar_it == cigar_.data().end() + || (cigar_it + 1 == cigar_.data().end() && cigar_it->first == Cigar::CLIPPED)); path["length"] = Json::Value::UInt64(nodes_.size()); @@ -500,7 +474,7 @@ Json::Value Alignment::to_json(std::string_view full_query, template std::shared_ptr Alignment ::load_from_json(const Json::Value &alignment, const DeBruijnGraph &graph) { - cigar_.clear(); + cigar_ = Cigar(); nodes_.clear(); sequence_.clear(); @@ -526,13 +500,11 @@ ::load_from_json(const Json::Value &alignment, const DeBruijnGraph &graph) { if (nodes_.size() == 1) { sequence_ = graph.get_node_sequence(nodes_.back()).substr(offset_); } else { - graph.call_outgoing_kmers( - *(nodes_.rbegin() + 1), - [&](auto node, char c) { - if (node == nodes_.back()) - sequence_ += c; - } - ); + graph.call_outgoing_kmers(*(nodes_.rbegin() + 1), + [&](auto node, char c) { + if (node == nodes_.back()) + sequence_ += c; + }); } const Json::Value &edits = mapping[i]["edit"]; diff --git a/metagraph/src/graph/alignment/aligner_alignment.hpp b/metagraph/src/graph/alignment/aligner_alignment.hpp index 28831a10e7..439ae766d5 100644 --- a/metagraph/src/graph/alignment/aligner_alignment.hpp +++ b/metagraph/src/graph/alignment/aligner_alignment.hpp @@ -60,7 +60,7 @@ class Alignment { clipping, orientation, offset) { - assert(nodes.empty() || clipping || is_exact_match()); + assert(nodes.empty() || clipping || cigar_.is_exact_match(query_.size())); } // Used for constructing exact match seeds @@ -89,38 +89,23 @@ class Alignment { void set_query_begin(const char *begin) { query_ = { begin, query_.size() }; } void extend_query_begin(const char *begin) { - size_t clipping = get_clipping(); - const char *full_query_begin = query_.data() - clipping; - assert(begin <= full_query_begin); - if (begin == full_query_begin) - return; - - if (clipping) { - cigar_.front().second += full_query_begin - begin; - } else { - cigar_.insert(cigar_.begin(), - std::make_pair(Cigar::CLIPPED, full_query_begin - begin)); - } + const char *full_query_begin = query_.data() - get_clipping(); + assert(full_query_begin >= begin); + if (full_query_begin > begin) + cigar_.extend_clipping(full_query_begin - begin); } void extend_query_end(const char *end) { const char *full_query_end = query_.data() + query_.size() + get_end_clipping(); - assert(end >= full_query_end); - if (end > full_query_end) + assert(full_query_end <= end); + if (full_query_end < end) cigar_.append(Cigar::CLIPPED, end - full_query_end); } - void trim_clipping() { - if (get_clipping()) - cigar_.pop_front(); - } - - void trim_end_clipping() { - if (get_end_clipping()) - cigar_.pop_back(); - } + inline size_t trim_clipping() { return cigar_.trim_clipping(); } + inline size_t trim_end_clipping() { return cigar_.trim_end_clipping(); } - void trim_offset(); + size_t trim_offset(); void reverse_complement(const DeBruijnGraph &graph, std::string_view query_rev_comp); @@ -148,11 +133,6 @@ class Alignment { bool operator!=(const Alignment &other) const { return !(*this == other); } - bool is_exact_match() const { - return cigar_.size() == 1 - && cigar_.front() == Cigar::value_type(Cigar::MATCH, query_.size()); - } - Json::Value to_json(std::string_view query, const DeBruijnGraph &graph, bool is_secondary = false, @@ -168,6 +148,7 @@ class Alignment { private: Json::Value path_json(size_t node_size, std::string_view label = {}) const; + // TODO: rename to query_view_ std::string_view query_; std::vector nodes_; std::string sequence_; diff --git a/metagraph/src/graph/alignment/aligner_cigar.cpp b/metagraph/src/graph/alignment/aligner_cigar.cpp index 322029e72c..6295bea5f7 100644 --- a/metagraph/src/graph/alignment/aligner_cigar.cpp +++ b/metagraph/src/graph/alignment/aligner_cigar.cpp @@ -124,21 +124,26 @@ bool Cigar::is_valid(std::string_view reference, std::string_view query) const { } switch (op.first) { - case Operator::CLIPPED: { + case CLIPPED: { if ((ref_it != reference.begin() || alt_it != query.begin()) && (ref_it != reference.end() || alt_it != query.end())) { - std::cerr << "Internal clipping found in CIGAR" << std::endl - << to_string() << std::endl - << reference << std::endl - << query << std::endl; - return false; + if (alt_it > query.end() - op.second) { + std::cerr << "Query too short after " + << Cigar::opt_to_char(op.first) << std::endl + << to_string() << std::endl + << reference << std::endl + << query << std::endl; + return false; + } + + alt_it += op.second; } } break; - case Operator::MATCH: - // do nothing - case Operator::MISMATCH: { + case MATCH: + case MISMATCH: { if (ref_it > reference.end() - op.second) { - std::cerr << "Reference too short" << std::endl + std::cerr << "Reference too short after " + << Cigar::opt_to_char(op.first) << std::endl << to_string() << std::endl << reference << std::endl << query << std::endl; @@ -146,7 +151,8 @@ bool Cigar::is_valid(std::string_view reference, std::string_view query) const { } if (alt_it > query.end() - op.second) { - std::cerr << "Query too short" << std::endl + std::cerr << "Query too short after " + << Cigar::opt_to_char(op.first) << std::endl << to_string() << std::endl << reference << std::endl << query << std::endl; @@ -165,8 +171,8 @@ bool Cigar::is_valid(std::string_view reference, std::string_view query) const { ref_it += op.second; alt_it += op.second; } break; - case Operator::INSERTION: { - if (i && cigar_[i - 1].first == Operator::DELETION) { + case INSERTION: { + if (i && cigar_[i - 1].first == DELETION) { std::cerr << "INSERTION after DELETION" << std::endl << to_string() << std::endl << reference << std::endl @@ -175,7 +181,8 @@ bool Cigar::is_valid(std::string_view reference, std::string_view query) const { } if (alt_it > query.end() - op.second) { - std::cerr << "Query too short" << std::endl + std::cerr << "Query too short after " + << Cigar::opt_to_char(op.first) << std::endl << to_string() << std::endl << reference << std::endl << query << std::endl; @@ -184,8 +191,8 @@ bool Cigar::is_valid(std::string_view reference, std::string_view query) const { alt_it += op.second; } break; - case Operator::DELETION: { - if (i && cigar_[i - 1].first == Operator::INSERTION) { + case DELETION: { + if (i && cigar_[i - 1].first == INSERTION) { std::cerr << "DELETION after INSERTION" << std::endl << to_string() << std::endl << reference << std::endl @@ -194,7 +201,8 @@ bool Cigar::is_valid(std::string_view reference, std::string_view query) const { } if (ref_it > reference.end() - op.second) { - std::cerr << "Reference too short" << std::endl + std::cerr << "Reference too short after " + << Cigar::opt_to_char(op.first) << std::endl << to_string() << std::endl << reference << std::endl << query << std::endl; diff --git a/metagraph/src/graph/alignment/aligner_cigar.hpp b/metagraph/src/graph/alignment/aligner_cigar.hpp index bc82f859df..03503e0ba6 100644 --- a/metagraph/src/graph/alignment/aligner_cigar.hpp +++ b/metagraph/src/graph/alignment/aligner_cigar.hpp @@ -26,12 +26,13 @@ class Cigar { typedef uint32_t LengthType; typedef std::pair value_type; - Cigar(Operator op = Operator::CLIPPED, LengthType num = 0) + Cigar(Operator op = CLIPPED, LengthType num = 0) : cigar_(num ? 1 : 0, std::make_pair(op, num)) { } // See section 1.4 in https://samtools.github.io/hts-specs/SAMv1.pdf for // a specification of the CIGAR string format. // e.g., 3=1X2I3D for 3 matches, 1 mismatch, 2 insertions, 3 deletions + // The symbol 'G' is introduced to indicate the insertion of a graph node. Cigar(std::string_view cigar_str); size_t size() const { return cigar_.size(); } @@ -42,63 +43,63 @@ class Cigar { void append(Operator op, LengthType num = 1); void append(Cigar&& other); - void pop_front() { - assert(cigar_.size()); - cigar_.erase(cigar_.begin(), cigar_.begin() + 1); + LengthType trim_clipping() { + if (cigar_.size() && cigar_.front().first == CLIPPED) { + LengthType ret_val = cigar_.front().second; + cigar_.erase(cigar_.begin(), cigar_.begin() + 1); + return ret_val; + } else { + return 0; + } } - void pop_back() { - assert(cigar_.size()); - cigar_.pop_back(); + LengthType trim_end_clipping() { + if (cigar_.size() && cigar_.back().first == CLIPPED) { + LengthType ret_val = cigar_.back().second; + cigar_.pop_back(); + return ret_val; + } else { + return 0; + } } - typedef typename std::vector::iterator iterator; - typedef typename std::vector::const_iterator const_iterator; + LengthType get_clipping() const { + return cigar_.size() && cigar_.front().first == CLIPPED ? cigar_.front().second : 0; + } - // This is essentially just a vector, so there's no reason not to have it editable - iterator begin() { return cigar_.begin(); } - iterator end() { return cigar_.end(); } - const_iterator begin() const { return cigar_.cbegin(); } - const_iterator end() const { return cigar_.cend(); } + LengthType get_end_clipping() const { + return cigar_.size() && cigar_.back().first == CLIPPED ? cigar_.back().second : 0; + } - template - void insert(iterator it, Args&&... args) { - cigar_.insert(it, std::forward(args)...); + void extend_clipping(LengthType n) { + assert(cigar_.size()); + if (cigar_.front().first != CLIPPED) { + cigar_.insert(cigar_.begin(), value_type(CLIPPED, n)); + } else { + cigar_.front().second += n; + } } - value_type& front() { return cigar_.front(); } - value_type& back() { return cigar_.back(); } - const value_type& front() const { return cigar_.front(); } - const value_type& back() const { return cigar_.back(); } + std::vector& data() { return cigar_; } + const std::vector& data() const { return cigar_; } bool operator==(const Cigar &other) const { return cigar_ == other.cigar_; } - bool operator!=(const Cigar &other) const { return !(*this == other); } - void clear() { cigar_.clear(); } - - LengthType get_clipping() const { - return cigar_.size() && cigar_.front().first == Operator::CLIPPED - ? cigar_.front().second - : 0; - } - - LengthType get_end_clipping() const { - return cigar_.size() && cigar_.back().first == Operator::CLIPPED - ? cigar_.back().second - : 0; - } - size_t get_num_matches() const { - return std::accumulate(begin(), end(), 0, [&](size_t old, const value_type &op) { - return old + (op.first == Operator::MATCH) * op.second; + return std::accumulate(cigar_.begin(), cigar_.end(), 0, + [&](size_t old, const value_type &op) { + return old + (op.first == MATCH) * op.second; }); } - // Return true if the cigar is valid. reference_begin points to the first - // character of the reference sequence after clipping is trimmed + // Return true if the cigar is valid bool is_valid(std::string_view reference, std::string_view query) const; + bool is_exact_match(size_t query_size) const { + return cigar_.size() == 1 && cigar_.front() == value_type{ MATCH, query_size }; + } + static constexpr char opt_to_char(Cigar::Operator op) { return op_str_[op]; } private: diff --git a/metagraph/src/graph/alignment/aligner_config.cpp b/metagraph/src/graph/alignment/aligner_config.cpp index 52946e0be2..32d05c1754 100644 --- a/metagraph/src/graph/alignment/aligner_config.cpp +++ b/metagraph/src/graph/alignment/aligner_config.cpp @@ -62,13 +62,23 @@ ::score_cigar(std::string_view reference, score_t score = 0; assert(cigar.is_valid(reference, query)); + + if (cigar.empty()) + return score; + auto ref_it = reference.begin(); auto alt_it = query.begin(); + auto it = cigar.data().begin(); + if (it->first == Cigar::CLIPPED) + ++it; - for (const auto &op : cigar) { + for ( ; it != cigar.data().end(); ++it) { + const auto &op = *it; switch (op.first) { - case Cigar::CLIPPED: - break; + case Cigar::CLIPPED: { + if (it + 1 != cigar.data().end()) + alt_it += op.second; + } break; case Cigar::MATCH: { score += match_score(std::string_view(ref_it, op.second)); ref_it += op.second; @@ -91,6 +101,9 @@ ::score_cigar(std::string_view reference, } } + assert(ref_it == reference.end()); + assert(alt_it == query.end()); + return score; } diff --git a/metagraph/src/graph/alignment/aligner_extender_methods.cpp b/metagraph/src/graph/alignment/aligner_extender_methods.cpp index d47ebb9da8..34836bb86a 100644 --- a/metagraph/src/graph/alignment/aligner_extender_methods.cpp +++ b/metagraph/src/graph/alignment/aligner_extender_methods.cpp @@ -54,8 +54,8 @@ template void DefaultColumnExtender::initialize(const DBGAlignment &seed) { assert(seed.size()); assert(seed.get_cigar().size()); - assert(seed.get_cigar().back().first == Cigar::MATCH - || seed.get_cigar().back().first == Cigar::MISMATCH); + assert(seed.get_cigar().data().back().first == Cigar::MATCH + || seed.get_cigar().data().back().first == Cigar::MISMATCH); seed_ = &seed; reset(); @@ -291,7 +291,7 @@ bool update_column(const DeBruijnGraph &graph_, std::ignore = seed_; assert(std::get<0>(prev_node) != graph_.max_index() + 1 || std::get<2>(prev_node) || (offset <= 1 && S[1 - offset] == seed_.get_score() - && OS[1 - offset] == seed_.get_cigar().back().first + && OS[1 - offset] == seed_.get_cigar().data().back().first && PS[1 - offset] == Extender::PREV)); return updated; @@ -348,7 +348,7 @@ void backtrack(const Table &table_, if (std::get<0>(prev) != graph_.max_index() + 1 || std::get<0>(best_node) != seed_.back() || std::get<2>(best_node) - || last_op != seed_.get_cigar().back().first) { + || last_op != seed_.get_cigar().data().back().first) { // last op in the seed was skipped // TODO: reconstruct the entire alignment. for now, throw this out return; @@ -416,7 +416,7 @@ void backtrack(const Table &table_, if (pos > 1) cigar.append(Cigar::CLIPPED, pos - 1); - std::reverse(cigar.begin(), cigar.end()); + std::reverse(cigar.data().begin(), cigar.data().end()); std::reverse(path.begin(), path.end()); std::reverse(seq.begin(), seq.end()); diff --git a/metagraph/tests/graph/test_aligner.cpp b/metagraph/tests/graph/test_aligner.cpp index 4e02e48f7d..b4eda117a9 100644 --- a/metagraph/tests/graph/test_aligner.cpp +++ b/metagraph/tests/graph/test_aligner.cpp @@ -22,6 +22,10 @@ using namespace mtg::kmer; const std::string test_data_dir = "../tests/data"; const bool PICK_REV_COMP = true; +inline bool is_exact_match(const Alignment &alignment) { + return alignment.get_cigar().is_exact_match(alignment.get_query().size()); +} + void check_score_matrix(const DBGAlignerConfig &config, const char* alphabet, size_t alph_size) { @@ -136,7 +140,7 @@ TYPED_TEST(DBGAlignerTest, align_big_self_loop) { EXPECT_EQ(config.match_score(query), path.get_score()); EXPECT_EQ("9=", path.get_cigar().to_string()); EXPECT_EQ(9u, path.get_num_matches()); - EXPECT_TRUE(path.is_exact_match()); + EXPECT_TRUE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -164,7 +168,7 @@ TYPED_TEST(DBGAlignerTest, align_single_node) { EXPECT_EQ(config.match_score(query), path.get_score()); EXPECT_EQ("3=", path.get_cigar().to_string()); EXPECT_EQ(3u, path.get_num_matches()); - EXPECT_TRUE(path.is_exact_match()); + EXPECT_TRUE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -192,7 +196,7 @@ TYPED_TEST(DBGAlignerTest, align_straight) { EXPECT_EQ(config.match_score(query), path.get_score()); EXPECT_EQ("14=", path.get_cigar().to_string()); EXPECT_EQ(14u, path.get_num_matches()); - EXPECT_TRUE(path.is_exact_match()); + EXPECT_TRUE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -236,7 +240,7 @@ TYPED_TEST(DBGAlignerTest, align_straight_with_N) { EXPECT_EQ(config.score_sequences(reference, query), path.get_score()); EXPECT_EQ("4=1X9=", path.get_cigar().to_string()); EXPECT_EQ(13u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -269,7 +273,7 @@ TYPED_TEST(DBGAlignerTest, align_straight_forward_and_reverse_complement) { EXPECT_EQ(config.match_score(query), path.get_score()); EXPECT_EQ("14=", path.get_cigar().to_string()); EXPECT_EQ(14u, path.get_num_matches()); - EXPECT_TRUE(path.is_exact_match()); + EXPECT_TRUE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -317,7 +321,7 @@ TYPED_TEST(DBGAlignerTest, align_ending_branch) { EXPECT_EQ(config.match_score(query), path.get_score()); EXPECT_EQ("9=", path.get_cigar().to_string()); EXPECT_EQ(9u, path.get_num_matches()); - EXPECT_TRUE(path.is_exact_match()); + EXPECT_TRUE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -348,7 +352,7 @@ TYPED_TEST(DBGAlignerTest, align_branch) { EXPECT_EQ(config.match_score(query), path.get_score()); EXPECT_EQ("17=", path.get_cigar().to_string()); EXPECT_EQ(17u, path.get_num_matches()); - EXPECT_TRUE(path.is_exact_match()); + EXPECT_TRUE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -379,7 +383,7 @@ TYPED_TEST(DBGAlignerTest, align_branch_with_cycle) { EXPECT_EQ(config.match_score(query), path.get_score()); EXPECT_EQ("17=", path.get_cigar().to_string()); EXPECT_EQ(17u, path.get_num_matches()); - EXPECT_TRUE(path.is_exact_match()); + EXPECT_TRUE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -407,7 +411,7 @@ TYPED_TEST(DBGAlignerTest, repetitive_sequence_alignment) { EXPECT_EQ(config.match_score(query), path.get_score()); EXPECT_EQ("6=", path.get_cigar().to_string()); EXPECT_EQ(6u, path.get_num_matches()); - EXPECT_TRUE(path.is_exact_match()); + EXPECT_TRUE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -436,7 +440,7 @@ TYPED_TEST(DBGAlignerTest, variation) { EXPECT_EQ(config.score_sequences(query, reference), path.get_score()); EXPECT_EQ("5=1X6=", path.get_cigar().to_string()); EXPECT_EQ(11u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -472,7 +476,7 @@ TYPED_TEST(DBGAlignerTest, variation_in_branching_point) { // TODO: what about other cases? EXPECT_EQ("8=3X4=", path.get_cigar().to_string()); EXPECT_EQ(12u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -501,7 +505,7 @@ TYPED_TEST(DBGAlignerTest, multiple_variations) { EXPECT_EQ(config.score_sequences(query, reference), path.get_score()); EXPECT_EQ("6=1X6=1X1=1X4=", path.get_cigar().to_string()); EXPECT_EQ(17u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -534,7 +538,7 @@ TYPED_TEST(DBGAlignerTest, align_noise_in_branching_point) { EXPECT_EQ(reference_1 + "T", path.get_sequence()); EXPECT_EQ("4=1D7=", path.get_cigar().to_string()); EXPECT_EQ(11u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -566,7 +570,7 @@ TYPED_TEST(DBGAlignerTest, alternative_path_basic) { EXPECT_EQ("4=1X4=1X2=", path.get_cigar().to_string()) << query << "\n" << path.get_sequence(); EXPECT_EQ(10u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -596,7 +600,7 @@ TYPED_TEST(DBGAlignerTest, align_multiple_misalignment) { EXPECT_EQ(config.score_sequences(query, reference), path.get_score()); EXPECT_EQ("4=1X9=1X6=", path.get_cigar().to_string()); EXPECT_EQ(19u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -627,7 +631,7 @@ TYPED_TEST(DBGAlignerTest, align_insert_non_existent) { EXPECT_EQ(config.match_score(reference) + config.gap_opening_penalty, path.get_score()); EXPECT_EQ("5=1I5=", path.get_cigar().to_string()); EXPECT_EQ(10u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -659,7 +663,7 @@ TYPED_TEST(DBGAlignerTest, align_insert_multi) { + config.gap_opening_penalty + config.gap_extension_penalty, path.get_score()); EXPECT_EQ("5=2I5=", path.get_cigar().to_string()); EXPECT_EQ(10u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -691,7 +695,7 @@ TYPED_TEST(DBGAlignerTest, align_insert_long) { + score_t(8) * config.gap_extension_penalty, path.get_score()); EXPECT_EQ("5=9I5=", path.get_cigar().to_string()); EXPECT_EQ(10u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -725,7 +729,7 @@ TYPED_TEST(DBGAlignerTest, align_insert_long_offset) { EXPECT_TRUE(path.get_cigar().to_string() == "6=1X9I6=" || path.get_cigar().to_string() == "6=9I1X6=") << path.get_cigar().to_string(); EXPECT_EQ(12u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -751,7 +755,7 @@ TYPED_TEST(DBGAlignerTest, align_delete) { ASSERT_EQ(1ull, paths.size()); auto path = paths[0]; - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(reference.size() - k + 1, path.size()); EXPECT_EQ(reference, path.get_sequence()); EXPECT_EQ(config.match_score(query) + config.gap_opening_penalty, path.get_score()); @@ -761,7 +765,7 @@ TYPED_TEST(DBGAlignerTest, align_delete) { || "5=1D6=" == path.get_cigar().to_string()); // EXPECT_EQ("6=1I5=", path.get_cigar().to_string()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -794,7 +798,7 @@ TYPED_TEST(DBGAlignerTest, align_gap) { + score_t(3) * config.gap_extension_penalty, path.get_score()); EXPECT_EQ("10=4D9=", path.get_cigar().to_string()); EXPECT_EQ(19u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -826,7 +830,7 @@ TYPED_TEST(DBGAlignerTest, align_gap_after_seed) { + score_t(3) * config.gap_extension_penalty, path.get_score()); EXPECT_EQ("4=4D9=", path.get_cigar().to_string()); EXPECT_EQ(13u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -860,7 +864,7 @@ TYPED_TEST(DBGAlignerTest, align_loop_deletion) { + score_t(2) * config.gap_extension_penalty, path.get_score()); EXPECT_EQ("4=3D9=", path.get_cigar().to_string()); EXPECT_EQ(13u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -890,7 +894,7 @@ TYPED_TEST(DBGAlignerTest, align_straight_long_xdrop) { EXPECT_EQ(config.match_score(query), path.get_score()); EXPECT_EQ("63=", path.get_cigar().to_string()); EXPECT_EQ(63u, path.get_num_matches()); - EXPECT_TRUE(path.is_exact_match()); + EXPECT_TRUE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -922,7 +926,7 @@ TYPED_TEST(DBGAlignerTest, align_drop_seed) { EXPECT_EQ(config.match_score(reference.substr(7)), path.get_score()); EXPECT_EQ("7S9=", path.get_cigar().to_string()); EXPECT_EQ(9u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(7u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -954,7 +958,7 @@ TYPED_TEST(DBGAlignerTest, align_long_gap_after_seed) { EXPECT_EQ(config.match_score(query.substr(4)), path.get_score()); EXPECT_EQ("4S9=", path.get_cigar().to_string()); EXPECT_EQ(9u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(4u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -995,7 +999,7 @@ TYPED_TEST(DBGAlignerTest, align_repeat_sequence_no_delete_after_insert) { || path.get_cigar().to_string() == "44=4I1=3I8=1X39=") << path.get_cigar().to_string(); EXPECT_EQ(92u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -1019,7 +1023,7 @@ TYPED_TEST(DBGAlignerTest, align_repeat_sequence_no_delete_after_insert) { || path.get_cigar().to_string() == "44=4I1=3I8=1X39=") << path.get_cigar().to_string(); EXPECT_EQ(92u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -1046,7 +1050,7 @@ TYPED_TEST(DBGAlignerTest, align_clipping1) { EXPECT_EQ("2S8=", path.get_cigar().to_string()) << reference.substr(2) << " " << path.get_sequence(); EXPECT_EQ(8u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(2u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -1075,7 +1079,7 @@ TYPED_TEST(DBGAlignerTest, align_clipping2) { EXPECT_EQ(config.match_score(query.substr(2)), path.get_score()); EXPECT_EQ("2S14=", path.get_cigar().to_string()); EXPECT_EQ(14u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(2u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -1105,7 +1109,7 @@ TYPED_TEST(DBGAlignerTest, align_long_clipping) { EXPECT_EQ(config.match_score(query.substr(7)), path.get_score()); EXPECT_EQ("7S17=", path.get_cigar().to_string()); EXPECT_EQ(17u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(7u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -1134,7 +1138,7 @@ TYPED_TEST(DBGAlignerTest, align_end_clipping) { EXPECT_EQ(config.match_score(query.substr(0, 17)), path.get_score()); EXPECT_EQ("17=7S", path.get_cigar().to_string()); EXPECT_EQ(17u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(7u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -1165,7 +1169,7 @@ TYPED_TEST(DBGAlignerTest, align_clipping_min_cell_score) { EXPECT_EQ(config.match_score(query.substr(2)), path.get_score()); EXPECT_EQ("2S13=", path.get_cigar().to_string()); EXPECT_EQ(13u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(2u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -1193,7 +1197,7 @@ TYPED_TEST(DBGAlignerTest, align_low_similarity) { // EXPECT_EQ(config.match_score(query.substr(2)), path.get_score()); // EXPECT_EQ("2S13=", path.get_cigar().to_string()); // EXPECT_EQ(13u, path.get_num_matches()); - // EXPECT_FALSE(path.is_exact_match()); + // EXPECT_FALSE(is_exact_match(path)); // EXPECT_EQ(2u, path.get_clipping()); // EXPECT_EQ(0u, path.get_end_clipping()); // EXPECT_EQ(0u, path.get_offset()); @@ -1319,7 +1323,7 @@ TEST(DBGAlignerTest, align_suffix_seed_snp_min_seed_length) { EXPECT_EQ(config.match_score(query.substr(2)), path.get_score()); EXPECT_EQ("2S13=", path.get_cigar().to_string()); EXPECT_EQ(13u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(2u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -1353,7 +1357,7 @@ TEST(DBGAlignerTest, align_suffix_seed_snp_min_seed_length) { EXPECT_EQ(config.score_sequences(query, reference.substr(12)), path.get_score()); EXPECT_EQ("1=1X13=", path.get_cigar().to_string()); EXPECT_EQ(14u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -1414,7 +1418,7 @@ TEST(DBGAlignerTest, align_suffix_seed_snp_canonical) { } EXPECT_EQ(5u, path.get_offset()); EXPECT_EQ(13u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_TRUE(path.is_valid(*graph, &config)); check_json_dump_load(*graph, path, paths.get_query(), paths.get_query(PICK_REV_COMP)); @@ -1454,7 +1458,7 @@ TYPED_TEST(DBGAlignerTest, align_both_directions) { } EXPECT_EQ(17u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -1484,7 +1488,7 @@ TYPED_TEST(DBGAlignerTest, align_nodummy) { EXPECT_EQ(config.score_sequences(query.substr(6), reference.substr(6)), path.get_score()); EXPECT_EQ("6S12=", path.get_cigar().to_string()); EXPECT_EQ(12u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(6u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); @@ -1531,7 +1535,7 @@ TEST(DBGAlignerTest, align_dummy) { EXPECT_EQ(config.score_sequences(query, reference), path.get_score()); EXPECT_EQ("5=1X12=", path.get_cigar().to_string()); EXPECT_EQ(17u, path.get_num_matches()); - EXPECT_FALSE(path.is_exact_match()); + EXPECT_FALSE(is_exact_match(path)); EXPECT_EQ(0u, path.get_clipping()); EXPECT_EQ(0u, path.get_end_clipping()); EXPECT_EQ(0u, path.get_offset()); From 2072480129242b9b0040e23883083580ec1a0e13 Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Sun, 25 Jul 2021 03:56:26 +0200 Subject: [PATCH 05/35] remove redundant includes --- metagraph/src/cli/align.cpp | 3 +-- metagraph/src/graph/annotated_dbg.cpp | 5 ----- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/metagraph/src/cli/align.cpp b/metagraph/src/cli/align.cpp index f290ec7053..44c865eed6 100644 --- a/metagraph/src/cli/align.cpp +++ b/metagraph/src/cli/align.cpp @@ -9,7 +9,6 @@ #include "graph/representation/canonical_dbg.hpp" #include "graph/alignment/dbg_aligner.hpp" #include "graph/alignment/aligner_seeder_methods.hpp" -#include "graph/alignment/aligner_extender_methods.hpp" #include "seq_io/sequence_io.hpp" #include "config/config.hpp" #include "load/load_graph.hpp" @@ -176,7 +175,7 @@ void map_sequences_in_file(const std::string &file, } else if (config.query_presence || config.count_kmers) { // TODO: make more efficient // TODO: canonicalization - if (dbg->get_mode() == CanonicalDBG::PRIMARY) + if (dbg->get_mode() == DeBruijnGraph::PRIMARY) logger->warn("Sub-k-mers will be mapped to unwrapped primary graph"); for (size_t i = 0; i + graph.get_k() <= read_stream->seq.l; ++i) { diff --git a/metagraph/src/graph/annotated_dbg.cpp b/metagraph/src/graph/annotated_dbg.cpp index 1461a9baf2..bddd5e89d6 100644 --- a/metagraph/src/graph/annotated_dbg.cpp +++ b/metagraph/src/graph/annotated_dbg.cpp @@ -1,11 +1,6 @@ #include "annotated_dbg.hpp" #include - -#ifdef __AVX2__ -#include -#endif - #include #include "graph/representation/canonical_dbg.hpp" From ff972f4415e8f8eddb9772506c407f4b548838f0 Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Sun, 25 Jul 2021 04:34:03 +0200 Subject: [PATCH 06/35] Remove template from Alignment --- metagraph/src/cli/align.cpp | 13 ++- .../graph/alignment/aligner_aggregator.hpp | 40 ++++---- .../src/graph/alignment/aligner_alignment.cpp | 62 +++++------- .../src/graph/alignment/aligner_alignment.hpp | 40 ++++---- .../alignment/aligner_extender_methods.cpp | 71 ++++++-------- .../alignment/aligner_extender_methods.hpp | 46 ++++----- .../alignment/aligner_seeder_methods.cpp | 95 ++++++++----------- .../alignment/aligner_seeder_methods.hpp | 48 ++++------ metagraph/src/graph/alignment/dbg_aligner.cpp | 69 +++++++------- metagraph/src/graph/alignment/dbg_aligner.hpp | 70 ++++++-------- metagraph/tests/graph/test_aligner.cpp | 16 ++-- .../tests/graph/test_aligner_helpers.hpp | 19 ++-- 12 files changed, 253 insertions(+), 336 deletions(-) diff --git a/metagraph/src/cli/align.cpp b/metagraph/src/cli/align.cpp index 44c865eed6..af3b70b70a 100644 --- a/metagraph/src/cli/align.cpp +++ b/metagraph/src/cli/align.cpp @@ -105,11 +105,11 @@ std::unique_ptr build_aligner(const DeBruijnGraph &graph, // Use the seeder that seeds to node suffixes if (aligner_config.max_seed_length == k) { - return std::make_unique>>>( + return std::make_unique>>( graph, aligner_config ); } else { - return std::make_unique>>>( + return std::make_unique>>( graph, aligner_config ); } @@ -122,7 +122,7 @@ std::unique_ptr build_aligner(const DeBruijnGraph &graph, } else { // seeds are maximal matches within unitigs (uni-MEMs) - return std::make_unique>>(graph, aligner_config); + return std::make_unique>(graph, aligner_config); } } @@ -327,7 +327,7 @@ void gfa_map_files(const Config *config, } std::string format_alignment(std::string_view header, - const DBGAligner<>::DBGQueryAlignment &paths, + const QueryAlignment &paths, const DeBruijnGraph &graph, const Config &config) { std::string sout; @@ -356,9 +356,8 @@ std::string format_alignment(std::string_view header, } if (paths.empty()) { - Json::Value json_line = DBGAligner<>::DBGAlignment().to_json( - paths.get_query(), graph, secondary, header - ); + Json::Value json_line + = Alignment().to_json(paths.get_query(), graph, secondary, header); sout += fmt::format("{}\n", Json::writeString(builder, json_line)); } diff --git a/metagraph/src/graph/alignment/aligner_aggregator.hpp b/metagraph/src/graph/alignment/aligner_aggregator.hpp index 654006254c..c5e6c5a95a 100644 --- a/metagraph/src/graph/alignment/aligner_aggregator.hpp +++ b/metagraph/src/graph/alignment/aligner_aggregator.hpp @@ -10,11 +10,11 @@ namespace mtg { namespace graph { namespace align { -template +template class AlignmentAggregator { public: - typedef Alignment DBGAlignment; - typedef typename DBGAlignment::score_t score_t; + typedef Alignment::node_index node_index; + typedef Alignment::score_t score_t; AlignmentAggregator(std::string_view query, std::string_view rc_query, @@ -23,14 +23,14 @@ class AlignmentAggregator { assert(config_.num_alternative_paths); } - inline void add_alignment(DBGAlignment&& alignment); + inline void add_alignment(Alignment&& alignment); - inline score_t get_min_path_score(const DBGAlignment &seed) const; + inline score_t get_min_path_score(const Alignment &seed) const; - const DBGAlignment& maximum() const { return path_queue_.maximum(); } + const Alignment& maximum() const { return path_queue_.maximum(); } void pop_maximum() { path_queue_.pop_maximum(); } - void call_alignments(const std::function &callback); + void call_alignments(const std::function &callback); size_t size() const { return path_queue_.size(); } bool empty() const { return path_queue_.empty(); } @@ -39,37 +39,37 @@ class AlignmentAggregator { std::string_view query_; std::string_view rc_query_; const DBGAlignerConfig &config_; - boost::container::priority_deque, + boost::container::priority_deque, AlignmentCompare> path_queue_; + AlignmentCompare cmp_; }; -template -inline void AlignmentAggregator -::add_alignment(DBGAlignment&& alignment) { +template +inline void AlignmentAggregator::add_alignment(Alignment&& alignment) { if (path_queue_.size() < config_.num_alternative_paths) { path_queue_.emplace(std::move(alignment)); - } else if (!AlignmentCompare()(alignment, path_queue_.minimum())) { + } else if (!cmp_(alignment, path_queue_.minimum())) { path_queue_.update(path_queue_.begin(), std::move(alignment)); } } -template -inline auto AlignmentAggregator -::get_min_path_score(const DBGAlignment &) const -> score_t { +template +inline auto AlignmentAggregator +::get_min_path_score(const Alignment &) const -> score_t { return path_queue_.size() ? path_queue_.minimum().get_score() : config_.min_path_score; } -template -inline void AlignmentAggregator -::call_alignments(const std::function &callback) { +template +inline void AlignmentAggregator +::call_alignments(const std::function &callback) { if (path_queue_.empty()) return; while (path_queue_.size()) { - callback(DBGAlignment(path_queue_.maximum())); + callback(Alignment(path_queue_.maximum())); path_queue_.pop_maximum(); } } diff --git a/metagraph/src/graph/alignment/aligner_alignment.cpp b/metagraph/src/graph/alignment/aligner_alignment.cpp index 440fb5ee0c..5b1c748a81 100644 --- a/metagraph/src/graph/alignment/aligner_alignment.cpp +++ b/metagraph/src/graph/alignment/aligner_alignment.cpp @@ -14,14 +14,13 @@ namespace align { using mtg::common::logger; -template -Alignment::Alignment(std::string_view query, - std::vector&& nodes, - std::string&& sequence, - score_t score, - size_t clipping, - bool orientation, - size_t offset) +Alignment::Alignment(std::string_view query, + std::vector&& nodes, + std::string&& sequence, + score_t score, + size_t clipping, + bool orientation, + size_t offset) : query_(query), nodes_(std::move(nodes)), sequence_(std::move(sequence)), score_(score), orientation_(orientation), offset_(offset) { size_t min_length = std::min(query_.size(), sequence_.size()); @@ -43,8 +42,7 @@ Alignment::Alignment(std::string_view query, cigar_.append(Cigar::DELETION, sequence_.size() - min_length); } -template -void Alignment::append(Alignment&& other) { +void Alignment::append(Alignment&& other) { assert(query_.data() + query_.size() + other.get_clipping() == other.query_.data()); assert(orientation_ == other.orientation_); @@ -58,8 +56,7 @@ void Alignment::append(Alignment&& other) { query_ = std::string_view(query_.data(), other.query_.end() - query_.begin()); } -template -size_t Alignment::trim_offset() { +size_t Alignment::trim_offset() { if (!offset_ || nodes_.size() <= 1) return 0; @@ -69,9 +66,8 @@ size_t Alignment::trim_offset() { return trim; } -template -void Alignment::reverse_complement(const DeBruijnGraph &graph, - std::string_view query_rev_comp) { +void Alignment::reverse_complement(const DeBruijnGraph &graph, + std::string_view query_rev_comp) { assert(graph.get_mode() == DeBruijnGraph::CANONICAL); if (empty()) @@ -141,9 +137,9 @@ void Alignment::reverse_complement(const DeBruijnGraph &graph, } for (size_t i = num_first_steps; i < offset_; ++i) { - NodeType next_node = 0; + node_index next_node = 0; char last_char; - canonical->call_outgoing_kmers(nodes_[0], [&](NodeType next, char c) { + canonical->call_outgoing_kmers(nodes_[0], [&](node_index next, char c) { if (c == boss::BOSS::kSentinel) return; @@ -184,7 +180,7 @@ void Alignment::reverse_complement(const DeBruijnGraph &graph, // trim off ending from reverse complement (corresponding to the added prefix) for (size_t i = 0; i < offset_; ++i) { size_t indegree = 0; - graph.adjacent_incoming_nodes(nodes_[0], [&](NodeType prev) { + graph.adjacent_incoming_nodes(nodes_[0], [&](node_index prev) { ++indegree; // TODO: there are multiple possible reverse complements, which @@ -220,9 +216,7 @@ void Alignment::reverse_complement(const DeBruijnGraph &graph, // derived from: // https://github.com/maickrau/GraphAligner/blob/236e1cf0514cfa9104e9a3333cdc1c43209c3c5a/src/vg.proto -template -Json::Value Alignment::path_json(size_t node_size, - std::string_view label) const { +Json::Value Alignment::path_json(size_t node_size, std::string_view label) const { assert(nodes_.size()); Json::Value path; @@ -391,12 +385,11 @@ Json::Value Alignment::path_json(size_t node_size, return path; } -template -Json::Value Alignment::to_json(std::string_view full_query, - const DeBruijnGraph &graph, - bool is_secondary, - std::string_view read_name, - std::string_view label) const { +Json::Value Alignment::to_json(std::string_view full_query, + const DeBruijnGraph &graph, + bool is_secondary, + std::string_view read_name, + std::string_view label) const { assert(is_valid(graph)); // encode alignment @@ -471,8 +464,7 @@ Json::Value Alignment::to_json(std::string_view full_query, return alignment; } -template -std::shared_ptr Alignment +std::shared_ptr Alignment ::load_from_json(const Json::Value &alignment, const DeBruijnGraph &graph) { cigar_ = Cigar(); nodes_.clear(); @@ -602,9 +594,8 @@ bool spell_path(const DeBruijnGraph &graph, return true; } -template -bool Alignment::is_valid(const DeBruijnGraph &graph, - const DBGAlignerConfig *config) const { +bool Alignment::is_valid(const DeBruijnGraph &graph, + const DBGAlignerConfig *config) const { std::string path; if (!spell_path(graph, nodes_, path, offset_)) { std::cerr << *this << std::endl; @@ -636,9 +627,7 @@ bool Alignment::is_valid(const DeBruijnGraph &graph, } -template -QueryAlignment::QueryAlignment(std::string_view query, - bool is_reverse_complement) +QueryAlignment::QueryAlignment(std::string_view query, bool is_reverse_complement) : query_(new std::string()), query_rc_(new std::string()) { // pad sequences for easier access in 64-bit blocks query_->reserve(query.size() + 8); @@ -664,9 +653,6 @@ QueryAlignment::QueryAlignment(std::string_view query, } -template class Alignment<>; -template class QueryAlignment<>; - } // namespace align } // namespace graph } // namespace mtg diff --git a/metagraph/src/graph/alignment/aligner_alignment.hpp b/metagraph/src/graph/alignment/aligner_alignment.hpp index 439ae766d5..20eb4488e3 100644 --- a/metagraph/src/graph/alignment/aligner_alignment.hpp +++ b/metagraph/src/graph/alignment/aligner_alignment.hpp @@ -11,6 +11,7 @@ #include "aligner_cigar.hpp" #include "aligner_config.hpp" +#include "graph/representation/base/sequence_graph.hpp" namespace mtg { @@ -23,14 +24,13 @@ namespace align { // Note: this object stores pointers to the query sequence, so it is the user's // responsibility to ensure that the query sequence is not destroyed when // calling this class' methods -template class Alignment { public: - typedef NodeType node_index; + typedef DeBruijnGraph::node_index node_index; typedef DBGAlignerConfig::score_t score_t; Alignment(std::string_view query, - std::vector&& nodes = {}, + std::vector&& nodes = {}, std::string&& sequence = "", score_t score = 0, Cigar&& cigar = Cigar(), @@ -47,7 +47,7 @@ class Alignment { // Used for constructing seeds Alignment(std::string_view query = {}, - std::vector&& nodes = {}, + std::vector&& nodes = {}, score_t score = 0, size_t clipping = 0, bool orientation = false, @@ -65,7 +65,7 @@ class Alignment { // Used for constructing exact match seeds Alignment(std::string_view query, - std::vector&& nodes, + std::vector&& nodes, std::string&& sequence, score_t score, size_t clipping = 0, @@ -76,10 +76,10 @@ class Alignment { size_t size() const { return nodes_.size(); } bool empty() const { return nodes_.empty(); } - const std::vector& get_nodes() const { return nodes_; } - const NodeType& operator[](size_t i) const { return nodes_[i]; } - const NodeType& front() const { return nodes_.front(); } - const NodeType& back() const { return nodes_.back(); } + const std::vector& get_nodes() const { return nodes_; } + const node_index& operator[](size_t i) const { return nodes_[i]; } + const node_index& front() const { return nodes_.front(); } + const node_index& back() const { return nodes_.back(); } score_t get_score() const { return score_; } uint64_t get_num_matches() const { return cigar_.get_num_matches(); } @@ -117,8 +117,8 @@ class Alignment { Cigar::LengthType get_clipping() const { return cigar_.get_clipping(); } Cigar::LengthType get_end_clipping() const { return cigar_.get_end_clipping(); } - typedef typename std::vector::iterator iterator; - typedef typename std::vector::const_iterator const_iterator; + typedef typename std::vector::iterator iterator; + typedef typename std::vector::const_iterator const_iterator; const_iterator begin() const { return nodes_.cbegin(); } const_iterator end() const { return nodes_.cend(); } @@ -150,7 +150,7 @@ class Alignment { // TODO: rename to query_view_ std::string_view query_; - std::vector nodes_; + std::vector nodes_; std::string sequence_; score_t score_; Cigar cigar_; @@ -158,8 +158,7 @@ class Alignment { size_t offset_; }; -template -std::ostream& operator<<(std::ostream& out, const Alignment &alignment) { +inline std::ostream& operator<<(std::ostream& out, const Alignment &alignment) { out << (alignment.get_orientation() ? "-" : "+") << "\t" << alignment.get_sequence() << "\t" << alignment.get_score() << "\t" @@ -176,8 +175,7 @@ bool spell_path(const DeBruijnGraph &graph, size_t offset = 0); struct LocalAlignmentLess { - template - bool operator()(const Alignment &a, const Alignment &b) { + bool operator()(const Alignment &a, const Alignment &b) { // 1) score is less, or // 2) more of the query is covered, or // 3) if it is in the reverse orientation, or @@ -190,8 +188,7 @@ struct LocalAlignmentLess { }; struct LocalAlignmentGreater { - template - bool operator()(const Alignment &a, const Alignment &b) { + bool operator()(const Alignment &a, const Alignment &b) { // 1) score is higher, or // 2) less of the query is covered, or // 3) if it is in the forward orientation, or @@ -204,10 +201,9 @@ struct LocalAlignmentGreater { }; -template class QueryAlignment { public: - typedef typename std::vector>::const_iterator const_iterator; + typedef typename std::vector::const_iterator const_iterator; QueryAlignment(std::string_view query, bool is_reverse_complement = false); @@ -234,7 +230,7 @@ class QueryAlignment { return !reverse_complement ? *query_ : *query_rc_; } - const Alignment& operator[](size_t i) const { return alignments_[i]; } + const Alignment& operator[](size_t i) const { return alignments_[i]; } const_iterator begin() const { return alignments_.cbegin(); } const_iterator end() const { return alignments_.cend(); } const_iterator cbegin() const { return alignments_.cbegin(); } @@ -243,7 +239,7 @@ class QueryAlignment { private: std::shared_ptr query_; std::shared_ptr query_rc_; - std::vector> alignments_; + std::vector alignments_; }; } // namespace align diff --git a/metagraph/src/graph/alignment/aligner_extender_methods.cpp b/metagraph/src/graph/alignment/aligner_extender_methods.cpp index 34836bb86a..9fccd50495 100644 --- a/metagraph/src/graph/alignment/aligner_extender_methods.cpp +++ b/metagraph/src/graph/alignment/aligner_extender_methods.cpp @@ -12,14 +12,14 @@ namespace mtg { namespace graph { namespace align { +typedef DeBruijnGraph::node_index node_index; typedef DBGAlignerConfig::score_t score_t; constexpr score_t ninf = std::numeric_limits::min() + 100; -template -DefaultColumnExtender::DefaultColumnExtender(const DeBruijnGraph &graph, - const DBGAlignerConfig &config, - std::string_view query) +DefaultColumnExtender::DefaultColumnExtender(const DeBruijnGraph &graph, + const DBGAlignerConfig &config, + std::string_view query) : graph_(graph), config_(config), query_(query) { assert(config_.check_config_scores()); partial_sums_.reserve(query_.size() + 1); @@ -50,8 +50,7 @@ DefaultColumnExtender::DefaultColumnExtender(const DeBruijnGraph &grap } } -template -void DefaultColumnExtender::initialize(const DBGAlignment &seed) { +void DefaultColumnExtender::initialize(const Alignment &seed) { assert(seed.size()); assert(seed.get_cigar().size()); assert(seed.get_cigar().data().back().first == Cigar::MATCH @@ -85,8 +84,7 @@ std::pair get_band(const Node &prev, max_it - S_prev.begin() + offset_prev); } -template @@ -100,8 +98,8 @@ bool update_column(const DeBruijnGraph &graph_, score_t &xdrop_cutoff, const ProfileScore &profile_score_, const ProfileOp &profile_op_, - const Alignment &seed_) { - typedef DefaultColumnExtender Extender; + const Alignment &seed_) { + typedef DefaultColumnExtender Extender; auto &[S, E, F, OS, OE, OF, prev_node, PS, PF, offset, max_pos] = next_column; size_t cur_size = S.size(); @@ -297,9 +295,9 @@ bool update_column(const DeBruijnGraph &graph_, return updated; } -template +template void backtrack(const Table &table_, - const Alignment &seed_, + const Alignment &seed_, const DeBruijnGraph &graph_, const DBGAlignerConfig &config_, score_t min_path_score, @@ -308,13 +306,13 @@ void backtrack(const Table &table_, size_t size, std::string_view extend_window_, std::string_view query, - std::vector> &extensions) { - typedef DefaultColumnExtender Extender; + std::vector &extensions) { + typedef DefaultColumnExtender Extender; Cigar cigar; - std::vector path; + std::vector path; std::string seq; - NodeType start_node = DeBruijnGraph::npos; + node_index start_node = DeBruijnGraph::npos; assert(table_.count(std::get<0>(best_node))); const auto &[S, E, F, OS, OE, OF, prev, PS, PF, offset, max_pos] @@ -420,7 +418,7 @@ void backtrack(const Table &table_, std::reverse(path.begin(), path.end()); std::reverse(seq.begin(), seq.end()); - Alignment extension( + Alignment extension( { extend_window_.data() + pos, max_pos - pos }, std::move(path), std::move(seq), score, std::move(cigar), 0, seed_.get_orientation(), graph_.get_k() - 1 @@ -448,9 +446,7 @@ void backtrack(const Table &table_, } } -template -auto DefaultColumnExtender::get_extensions(score_t min_path_score) - -> std::vector { +auto DefaultColumnExtender::get_extensions(score_t min_path_score) -> std::vector { const char *align_start = seed_->get_query().data() + seed_->get_query().size() - 1; size_t start = align_start - query_.data(); size_t size = query_.size() - start + 1; @@ -474,7 +470,7 @@ auto DefaultColumnExtender::get_extensions(score_t min_path_score) auto &[S, E, F, OS, OE, OF, prev_node, PS, PF, offset, max_pos] = first_column; size_t num_columns = 1; - constexpr size_t column_vector_size = sizeof(std::pair>); + constexpr size_t column_vector_size = sizeof(std::pair>); auto get_column_size = [&](const Scores &scores) { size_t size = std::get<0>(scores).capacity(); @@ -544,7 +540,7 @@ auto DefaultColumnExtender::get_extensions(score_t min_path_score) min_i /* offset */, 0 /* max_pos */); sanitize(next_column); - bool updated = update_column( + bool updated = update_column( graph_, config_, column_prev, next_column, c, start, size, xdrop_cutoff, profile_score_, profile_op_, *seed_ ); @@ -602,13 +598,13 @@ auto DefaultColumnExtender::get_extensions(score_t min_path_score) return seed ^ (hasher2(std::get<2>(x)) + 0x9e3779b9 + (seed << 6) + (seed >> 2)); } - std::hash hasher1; + std::hash hasher1; std::hash hasher2; }; tsl::hopscotch_set prev_starts; - std::vector extensions; + std::vector extensions; for (const auto &[best_node, max_score] : starts) { if (prev_starts.count(best_node)) continue; @@ -630,8 +626,8 @@ auto DefaultColumnExtender::get_extensions(score_t min_path_score) } } else { assert(OS[max_pos - offset] == Cigar::MATCH); - backtrack(table_, *seed_, graph_, config_, min_path_score, best_node, - prev_starts, size, extend_window_, query_, extensions); + backtrack(table_, *seed_, graph_, config_, min_path_score, best_node, + prev_starts, size, extend_window_, query_, extensions); } assert(extensions.size() < 2 @@ -644,9 +640,8 @@ auto DefaultColumnExtender::get_extensions(score_t min_path_score) return extensions; } -template -void DefaultColumnExtender -::call_visited_nodes(const std::function &callback) const { +void DefaultColumnExtender +::call_visited_nodes(const std::function &callback) const { size_t window_start = extend_window_.data() - query_.data(); for (const auto &[node, columns] : table_) { size_t start = query_.size(); @@ -687,9 +682,7 @@ ::call_visited_nodes(const std::function &callba } } -template -bool DefaultColumnExtender::has_converged(const Column &column, - const Scores &next) { +bool DefaultColumnExtender::has_converged(const Column &column, const Scores &next) { if (column.second) return true; @@ -706,8 +699,7 @@ bool DefaultColumnExtender::has_converged(const Column &column, && PS == PS_b && PF == PF_b; } -template -void DefaultColumnExtender::sanitize(Scores &scores) { +void DefaultColumnExtender::sanitize(Scores &scores) { auto &[S, E, F, OS, OE, OF, prev, PS, PF, offset, max_pos] = scores; size_t size = S.size(); @@ -735,14 +727,13 @@ void DefaultColumnExtender::sanitize(Scores &scores) { memset(&PF[size], 0, sizeof(typename decltype(PF)::value_type) * size_diff); } -template -std::vector> DefaultColumnExtender -::get_outgoing(const AlignNode &node) const { - std::vector> outgoing; +auto DefaultColumnExtender::get_outgoing(const AlignNode &node) const + -> std::vector> { + std::vector> outgoing; if (std::get<0>(node) == graph_.max_index() + 1) { outgoing.emplace_back(seed_->back(), seed_->get_sequence().back()); } else { - graph_.call_outgoing_kmers(std::get<0>(node), [&](NodeType next, char c) { + graph_.call_outgoing_kmers(std::get<0>(node), [&](node_index next, char c) { if (c != boss::BOSS::kSentinel) outgoing.emplace_back(next, c); }); @@ -751,8 +742,6 @@ ::get_outgoing(const AlignNode &node) const { return outgoing; } -template class DefaultColumnExtender<>; - } // namespace align } // namespace graph } // namespace mtg diff --git a/metagraph/src/graph/alignment/aligner_extender_methods.hpp b/metagraph/src/graph/alignment/aligner_extender_methods.hpp index 95c9e61cc7..27b87ddf26 100644 --- a/metagraph/src/graph/alignment/aligner_extender_methods.hpp +++ b/metagraph/src/graph/alignment/aligner_extender_methods.hpp @@ -1,5 +1,5 @@ -#ifndef __DBG_ALIGNER_METHODS_HPP__ -#define __DBG_ALIGNER_METHODS_HPP__ +#ifndef __DBG_EXTENDER_METHODS_HPP__ +#define __DBG_EXTENDER_METHODS_HPP__ #include @@ -9,43 +9,33 @@ namespace mtg { namespace graph { - -class DBGSuccinct; - namespace align { -template class IExtender { public: - typedef Alignment DBGAlignment; - typedef typename DBGAlignment::node_index node_index; - typedef typename DBGAlignment::score_t score_t; + typedef DeBruijnGraph::node_index node_index; + typedef Alignment::score_t score_t; virtual ~IExtender() {} - virtual std::vector + virtual std::vector get_extensions(score_t min_path_score = std::numeric_limits::min()) = 0; - virtual void initialize(const DBGAlignment &seed) = 0; + virtual void initialize(const Alignment &seed) = 0; virtual void - call_visited_nodes(const std::function &callback) const = 0; protected: virtual void reset() = 0; - virtual const DBGAlignment& get_seed() const = 0; + virtual const Alignment& get_seed() const = 0; }; -template -class DefaultColumnExtender : public IExtender { +class DefaultColumnExtender : public IExtender { public: - typedef typename IExtender::DBGAlignment DBGAlignment; - typedef typename IExtender::node_index node_index; - typedef typename IExtender::score_t score_t; - enum NodeId : uint8_t { NONE, PREV, @@ -58,13 +48,13 @@ class DefaultColumnExtender : public IExtender { virtual ~DefaultColumnExtender() {} - virtual std::vector + virtual std::vector get_extensions(score_t min_path_score = std::numeric_limits::min()) override; - virtual void initialize(const DBGAlignment &seed) override; + virtual void initialize(const Alignment &seed) override; virtual void - call_visited_nodes(const std::function &callback) const override; @@ -73,7 +63,7 @@ class DefaultColumnExtender : public IExtender { const DBGAlignerConfig &config_; std::string_view query_; - typedef std::tuple AlignNode; @@ -88,13 +78,13 @@ class DefaultColumnExtender : public IExtender { size_t /* max_pos */> Scores; typedef std::pair, bool> Column; - tsl::hopscotch_map table_; + tsl::hopscotch_map table_; virtual void reset() override { table_.clear(); } - virtual const DBGAlignment& get_seed() const override { return *seed_; } + virtual const Alignment& get_seed() const override { return *seed_; } - virtual std::vector> get_outgoing(const AlignNode &node) const; + virtual std::vector> get_outgoing(const AlignNode &node) const; private: // compute perfect match scores for all suffixes @@ -106,7 +96,7 @@ class DefaultColumnExtender : public IExtender { tsl::hopscotch_map> profile_op_; // the initial seed - const DBGAlignment *seed_; + const Alignment *seed_; std::string_view extend_window_; @@ -122,4 +112,4 @@ class DefaultColumnExtender : public IExtender { } // namespace graph } // namespace mtg -#endif // __DBG_ALIGNER_METHODS_HPP__ +#endif // __DBG_EXTENDER_METHODS_HPP__ diff --git a/metagraph/src/graph/alignment/aligner_seeder_methods.cpp b/metagraph/src/graph/alignment/aligner_seeder_methods.cpp index 82ebab472e..4e143ba615 100644 --- a/metagraph/src/graph/alignment/aligner_seeder_methods.cpp +++ b/metagraph/src/graph/alignment/aligner_seeder_methods.cpp @@ -10,14 +10,11 @@ namespace mtg { namespace graph { namespace align { -typedef DBGAlignerConfig::score_t score_t; - -template -ExactSeeder::ExactSeeder(const DeBruijnGraph &graph, - std::string_view query, - bool orientation, - std::vector&& nodes, - const DBGAlignerConfig &config) +ExactSeeder::ExactSeeder(const DeBruijnGraph &graph, + std::string_view query, + bool orientation, + std::vector&& nodes, + const DBGAlignerConfig &config) : graph_(graph), query_(query), orientation_(orientation), @@ -30,7 +27,7 @@ ExactSeeder::ExactSeeder(const DeBruijnGraph &graph, size_t last_match_count = 0; for (auto it = query_nodes_.begin(); it != query_nodes_.end(); ++it) { if (*it) { - auto jt = std::find(it + 1, query_nodes_.end(), NodeType()); + auto jt = std::find(it + 1, query_nodes_.end(), node_index{}); num_matching_ += graph_.get_k() + std::distance(it, jt) - 1 - last_match_count; last_match_count = graph_.get_k(); it = jt - 1; @@ -51,35 +48,26 @@ ExactSeeder::ExactSeeder(const DeBruijnGraph &graph, assert(!partial_sum_.front()); } -template -auto ExactSeeder::get_seeds() const -> std::vector { - const DeBruijnGraph &graph = this->graph_; - size_t k = graph.get_k(); +auto ExactSeeder::get_seeds() const -> std::vector { + size_t k = graph_.get_k(); + assert(k >= config_.min_seed_length); - const DBGAlignerConfig &config = this->config_; - assert(k >= config.min_seed_length); - - const std::vector &query_nodes = this->query_nodes_; - const std::vector &partial_sum = this->partial_sum_; - std::string_view query = this->query_; - bool orientation = this->orientation_; - - if (this->num_matching_ < config.min_exact_match * query.size()) + if (num_matching_ < config_.min_exact_match * query_.size()) return {}; std::vector seeds; - for (size_t i = 0; i < query_nodes.size(); ++i) { - if (query_nodes[i] != DeBruijnGraph::npos) { - assert(i + k <= query.size()); + for (size_t i = 0; i < query_nodes_.size(); ++i) { + if (query_nodes_[i] != DeBruijnGraph::npos) { + assert(i + k <= query_.size()); - score_t match_score = partial_sum[i + k] - partial_sum[i]; + score_t match_score = partial_sum_[i + k] - partial_sum_[i]; - if (match_score > config.min_cell_score) { - seeds.emplace_back(query.substr(i, k), - std::vector{ query_nodes[i] }, - match_score, i, orientation); - assert(seeds.back().is_valid(graph, &config)); + if (match_score > config_.min_cell_score) { + seeds.emplace_back(query_.substr(i, k), + std::vector{ query_nodes_[i] }, + match_score, i, orientation_); + assert(seeds.back().is_valid(graph_, &config_)); } } } @@ -136,7 +124,7 @@ void suffix_to_prefix(const DBGSuccinct &dbg_succ, template auto SuffixSeeder::get_seeds() const -> std::vector { // this method assumes that seeds from the BaseSeeder are exact match only - static_assert(std::is_base_of_v, BaseSeeder>); + static_assert(std::is_base_of_v); std::vector> suffix_seeds( this->query_.size() - this->config_.min_seed_length + 1 @@ -167,7 +155,7 @@ auto SuffixSeeder::get_seeds() const -> std::vector { assert(i < suffix_seeds.size()); std::string_view seed_seq = this->query_.substr(i, seed_length); - DBGAlignerConfig::score_t match_score = this->config_.match_score(seed_seq); + score_t match_score = this->config_.match_score(seed_seq); if (match_score <= this->config_.min_cell_score) return; @@ -335,19 +323,18 @@ ::get_base_dbg_succ(const DeBruijnGraph &graph) { : dynamic_cast(graph); } -template -auto MEMSeeder::get_seeds() const -> std::vector { - size_t k = this->graph_.get_k(); +auto MEMSeeder::get_seeds() const -> std::vector { + size_t k = graph_.get_k(); - if (this->num_matching_ < this->config_.min_exact_match * this->query_.size()) + if (num_matching_ < config_.min_exact_match * query_.size()) return {}; - std::vector query_node_flags(this->query_nodes_.size(), 0); + std::vector query_node_flags(query_nodes_.size(), 0); for (size_t i = 0; i < query_node_flags.size(); ++i) { - if (this->query_nodes_[i] != DeBruijnGraph::npos) { + if (query_nodes_[i] != DeBruijnGraph::npos) { // the second bit indicates that a node has been found, while the // first bit indicates if the node is a maximal exact match terminus - query_node_flags[i] = 2 | get_mem_terminator()[this->query_nodes_[i]]; + query_node_flags[i] = 2 | get_mem_terminator()[query_nodes_[i]]; } } @@ -372,27 +359,26 @@ auto MEMSeeder::get_seeds() const -> std::vector { size_t i = it - query_node_flags.begin(); assert(it == query_node_flags.end() - || this->query_nodes_[i] != DeBruijnGraph::npos); + || query_nodes_[i] != DeBruijnGraph::npos); size_t mem_length = (next - it) + k - 1; - assert(i + mem_length <= this->query_.size()); + assert(i + mem_length <= query_.size()); - if (mem_length >= this->config_.min_seed_length) { - const char *begin_it = this->query_.data() + i; + if (mem_length >= config_.min_seed_length) { + const char *begin_it = query_.data() + i; const char *end_it = begin_it + mem_length; - score_t match_score = this->partial_sum_[end_it - this->query_.data()] - - this->partial_sum_[i]; + score_t match_score = partial_sum_[end_it - query_.data()] - partial_sum_[i]; - auto node_begin_it = this->query_nodes_.begin() + i; + auto node_begin_it = query_nodes_.begin() + i; auto node_end_it = node_begin_it + (next - it); assert(std::find(node_begin_it, node_end_it, DeBruijnGraph::npos) == node_end_it); - if (match_score > this->config_.min_cell_score) { + if (match_score > config_.min_cell_score) { seeds.emplace_back(std::string_view(begin_it, mem_length), - std::vector{ node_begin_it, node_end_it }, - match_score, i,this->orientation_); - assert(seeds.back().is_valid(this->graph_, &this->config_)); + std::vector{ node_begin_it, node_end_it }, + match_score, i,orientation_); + assert(seeds.back().is_valid(graph_, &config_)); } } @@ -403,11 +389,8 @@ auto MEMSeeder::get_seeds() const -> std::vector { } -template class ExactSeeder<>; -template class MEMSeeder<>; -template class UniMEMSeeder<>; -template class SuffixSeeder>; -template class SuffixSeeder>; +template class SuffixSeeder; +template class SuffixSeeder; } // namespace align } // namespace graph diff --git a/metagraph/src/graph/alignment/aligner_seeder_methods.hpp b/metagraph/src/graph/alignment/aligner_seeder_methods.hpp index 3b6977af60..32da1c060e 100644 --- a/metagraph/src/graph/alignment/aligner_seeder_methods.hpp +++ b/metagraph/src/graph/alignment/aligner_seeder_methods.hpp @@ -12,22 +12,18 @@ class DBGSuccinct; namespace align { -template class ISeeder { public: - typedef Alignment Seed; + typedef DeBruijnGraph::node_index node_index; + typedef Alignment Seed; virtual ~ISeeder() {} virtual std::vector get_seeds() const = 0; }; -template -class ManualSeeder : public ISeeder { +class ManualSeeder : public ISeeder { public: - typedef NodeType node_index; - typedef Alignment Seed; - ManualSeeder(std::vector&& seeds) : seeds_(std::move(seeds)) {} virtual ~ManualSeeder() {} @@ -38,16 +34,14 @@ class ManualSeeder : public ISeeder { std::vector seeds_; }; -template -class ExactSeeder : public ISeeder { +class ExactSeeder : public ISeeder { public: - typedef NodeType node_index; - typedef typename ISeeder::Seed Seed; + typedef DBGAlignerConfig::score_t score_t; ExactSeeder(const DeBruijnGraph &graph, std::string_view query, bool orientation, - std::vector&& nodes, + std::vector&& nodes, const DBGAlignerConfig &config); virtual ~ExactSeeder() {} @@ -58,19 +52,16 @@ class ExactSeeder : public ISeeder { const DeBruijnGraph &graph_; std::string_view query_; bool orientation_; - std::vector query_nodes_; + std::vector query_nodes_; const DBGAlignerConfig &config_; - std::vector partial_sum_; + std::vector partial_sum_; size_t num_matching_; }; -template -class MEMSeeder : public ExactSeeder { +class MEMSeeder : public ExactSeeder { public: - typedef typename ISeeder::Seed Seed; - template - MEMSeeder(Args&&... args) : ExactSeeder(std::forward(args)...) {} + MEMSeeder(Args&&... args) : ExactSeeder(std::forward(args)...) {} virtual ~MEMSeeder() {} @@ -79,21 +70,17 @@ class MEMSeeder : public ExactSeeder { virtual const bitmap& get_mem_terminator() const = 0; }; -template -class UniMEMSeeder : public MEMSeeder { +class UniMEMSeeder : public MEMSeeder { public: - typedef NodeType node_index; - typedef typename ISeeder::Seed Seed; - template UniMEMSeeder(Args&&... args) - : MEMSeeder(std::forward(args)...), + : MEMSeeder(std::forward(args)...), is_mem_terminus_([&](auto i) { - return this->graph_.has_multiple_outgoing(i) - || this->graph_.indegree(i) > 1; + return graph_.has_multiple_outgoing(i) + || graph_.indegree(i) > 1; }, - this->graph_.max_index() + 1) { - assert(is_mem_terminus_.size() == this->graph_.max_index() + 1); + graph_.max_index() + 1) { + assert(is_mem_terminus_.size() == graph_.max_index() + 1); } virtual ~UniMEMSeeder() {} @@ -107,8 +94,9 @@ class UniMEMSeeder : public MEMSeeder { template class SuffixSeeder : public BaseSeeder { public: - typedef typename BaseSeeder::node_index node_index; typedef typename BaseSeeder::Seed Seed; + typedef typename BaseSeeder::node_index node_index; + typedef typename BaseSeeder::score_t score_t; template SuffixSeeder(Args&&... args) diff --git a/metagraph/src/graph/alignment/dbg_aligner.cpp b/metagraph/src/graph/alignment/dbg_aligner.cpp index 1f0e2e5c0d..5a46bf6b33 100644 --- a/metagraph/src/graph/alignment/dbg_aligner.cpp +++ b/metagraph/src/graph/alignment/dbg_aligner.cpp @@ -6,15 +6,14 @@ namespace mtg { namespace graph { namespace align { -IDBGAligner::DBGQueryAlignment IDBGAligner::align(std::string_view query, - bool is_reverse_complement) const { - DBGQueryAlignment result(query); +QueryAlignment IDBGAligner::align(std::string_view query, bool is_reverse_complement) const { + QueryAlignment result(query); std::string empty_header; align_batch( [&](const QueryCallback &callback) { callback(empty_header, query, is_reverse_complement); }, - [&](std::string_view, DBGQueryAlignment&& alignment) { + [&](std::string_view, QueryAlignment&& alignment) { result = std::move(alignment); } ); @@ -35,16 +34,16 @@ ::align_batch(const std::vector> &seq_batch, template void SeedAndExtendAlignerCore ::align_core(std::string_view query, - const ISeeder &seeder, - IExtender &extender, + const ISeeder &seeder, + IExtender &extender, const LocalAlignmentCallback &callback, const MinScoreComputer &get_min_path_score) const { - bool filter_seeds = dynamic_cast*>(&seeder); + bool filter_seeds = dynamic_cast(&seeder); - std::vector seeds = seeder.get_seeds(); + std::vector seeds = seeder.get_seeds(); std::sort(seeds.begin(), seeds.end(), LocalAlignmentGreater()); - for (DBGAlignment &seed : seeds) { + for (Alignment &seed : seeds) { score_t min_path_score = get_min_path_score(seed); // check if this seed has been explored before in an alignment and discard @@ -127,10 +126,10 @@ ::align_core(std::string_view query, template void SeedAndExtendAlignerCore -::align_one_direction(DBGQueryAlignment &paths, +::align_one_direction(QueryAlignment &paths, bool orientation_to_align, - const ISeeder &seeder, - IExtender&& extender) const { + const ISeeder &seeder, + IExtender&& extender) const { std::string_view query = paths.get_query(orientation_to_align); align_aggregate(paths, [&](const auto &alignment_callback, @@ -141,11 +140,11 @@ ::align_one_direction(DBGQueryAlignment &paths, template void SeedAndExtendAlignerCore -::align_best_direction(DBGQueryAlignment &paths, - const ISeeder &seeder, - const ISeeder &seeder_rc, - IExtender&& extender, - IExtender&& extender_rc) const { +::align_best_direction(QueryAlignment &paths, + const ISeeder &seeder, + const ISeeder &seeder_rc, + IExtender&& extender, + IExtender&& extender_rc) const { std::string_view forward = paths.get_query(); std::string_view reverse = paths.get_query(true); @@ -158,11 +157,11 @@ ::align_best_direction(DBGQueryAlignment &paths, template void SeedAndExtendAlignerCore -::align_both_directions(DBGQueryAlignment &paths, - const ISeeder &forward_seeder, - const ISeeder &reverse_seeder, - IExtender&& forward_extender, - IExtender&& reverse_extender, +::align_both_directions(QueryAlignment &paths, + const ISeeder &forward_seeder, + const ISeeder &reverse_seeder, + IExtender&& forward_extender, + IExtender&& reverse_extender, const AlignCoreGenerator &rev_comp_core_generator) const { std::string_view forward = paths.get_query(); std::string_view reverse = paths.get_query(true); @@ -171,16 +170,16 @@ ::align_both_directions(DBGQueryAlignment &paths, const auto &get_min_path_score) { auto get_forward_alignments = [&](std::string_view query, std::string_view query_rc, - const ISeeder &seeder, - IExtender &extender) { - std::vector rc_of_alignments; + const ISeeder &seeder, + IExtender &extender) { + std::vector rc_of_alignments; DEBUG_LOG("Extending in forwards direction"); - align_core(query, seeder, extender, [&](DBGAlignment&& path) { + align_core(query, seeder, extender, [&](Alignment&& path) { score_t min_path_score = get_min_path_score(path); if (path.get_score() >= min_path_score) - alignment_callback(DBGAlignment(path)); + alignment_callback(Alignment(path)); if (!path.get_clipping()) return; @@ -209,16 +208,16 @@ ::align_both_directions(DBGQueryAlignment &paths, return rc_of_alignments; }; - std::vector rc_of_reverse = get_forward_alignments( + std::vector rc_of_reverse = get_forward_alignments( reverse, forward, reverse_seeder, reverse_extender ); - std::vector rc_of_forward = get_forward_alignments( + std::vector rc_of_forward = get_forward_alignments( forward, reverse, forward_seeder, forward_extender ); auto extend_reverse = [&](std::string_view query_rc, - const ISeeder &seeder, - std::vector&& rc_of_alignments) { + const ISeeder &seeder, + std::vector&& rc_of_alignments) { DEBUG_LOG("Extending in reverse direction"); rev_comp_core_generator(query_rc, seeder, std::move(rc_of_alignments), [&](const auto &seeder_rc, auto&& extender_rc) { @@ -234,17 +233,17 @@ ::align_both_directions(DBGQueryAlignment &paths, template void SeedAndExtendAlignerCore -::align_aggregate(DBGQueryAlignment &paths, +::align_aggregate(QueryAlignment &paths, const AlignmentGenerator &alignment_generator) const { - AlignmentAggregator path_queue( + AlignmentAggregator path_queue( paths.get_query() /* forward */, paths.get_query(true) /* reverse complement */, config_ ); alignment_generator( - [&](DBGAlignment&& alignment) { path_queue.add_alignment(std::move(alignment)); }, - [&](const DBGAlignment &seed) { return path_queue.get_min_path_score(seed); } + [&](Alignment&& alignment) { path_queue.add_alignment(std::move(alignment)); }, + [&](const Alignment &seed) { return path_queue.get_min_path_score(seed); } ); path_queue.call_alignments([&](auto&& alignment) { diff --git a/metagraph/src/graph/alignment/dbg_aligner.hpp b/metagraph/src/graph/alignment/dbg_aligner.hpp index 78edbb78b7..739cc873fc 100644 --- a/metagraph/src/graph/alignment/dbg_aligner.hpp +++ b/metagraph/src/graph/alignment/dbg_aligner.hpp @@ -20,16 +20,14 @@ namespace align { class IDBGAligner { public: typedef DeBruijnGraph::node_index node_index; - typedef Alignment DBGAlignment; - typedef QueryAlignment DBGQueryAlignment; - typedef typename DBGAlignment::score_t score_t; + typedef Alignment::score_t score_t; typedef std::function QueryCallback; typedef std::function QueryGenerator; typedef std::function AlignmentCallback; + QueryAlignment&& /* alignments */)> AlignmentCallback; virtual ~IDBGAligner() {} @@ -38,8 +36,8 @@ class IDBGAligner { const AlignmentCallback &callback) const = 0; // Convenience methods - DBGQueryAlignment align(std::string_view query, - bool is_reverse_complement = false) const; + QueryAlignment align(std::string_view query, + bool is_reverse_complement = false) const; void align_batch(const std::vector> &seq_batch, const AlignmentCallback &callback) const; }; @@ -54,18 +52,11 @@ class ISeedAndExtendAligner : public IDBGAligner { template class SeedAndExtendAlignerCore; -template , - class Extender = DefaultColumnExtender<>, +template class DBGAligner : public ISeedAndExtendAligner { public: - typedef IDBGAligner::node_index node_index; - typedef IDBGAligner::DBGAlignment DBGAlignment; - typedef IDBGAligner::DBGQueryAlignment DBGQueryAlignment; - typedef IDBGAligner::score_t score_t; - typedef IDBGAligner::QueryGenerator QueryGenerator; - typedef IDBGAligner::AlignmentCallback AlignmentCallback; - DBGAligner(const DeBruijnGraph &graph, const DBGAlignerConfig &config) : graph_(graph), config_(config) { assert(config_.num_alternative_paths); @@ -88,20 +79,17 @@ template class SeedAndExtendAlignerCore { public: typedef IDBGAligner::node_index node_index; - typedef IDBGAligner::DBGAlignment DBGAlignment; - typedef IDBGAligner::DBGQueryAlignment DBGQueryAlignment; typedef IDBGAligner::score_t score_t; - typedef std::function LocalAlignmentCallback; - typedef std::function MinScoreComputer; + typedef std::function LocalAlignmentCallback; + typedef std::function MinScoreComputer; typedef const std::function AlignmentGenerator; - typedef std::function&, - IExtender&&)> AlignCoreCallback; + typedef std::function AlignCoreCallback; typedef std::function & /* forward seeder */, - std::vector&& /* rev_comp_seeds */, + const ISeeder& /* forward seeder */, + std::vector&& /* rev_comp_seeds */, const AlignCoreCallback&)> AlignCoreGenerator; SeedAndExtendAlignerCore(const DeBruijnGraph &graph, const DBGAlignerConfig &config) @@ -109,18 +97,18 @@ class SeedAndExtendAlignerCore { // Align the query sequence in the given orientation (false is forward, // true is reverse complement) - void align_one_direction(DBGQueryAlignment &paths, + void align_one_direction(QueryAlignment &paths, bool orientation_to_align, - const ISeeder &seeder, - IExtender&& extender) const; + const ISeeder &seeder, + IExtender&& extender) const; // Align both the forward and reverse complement of the query sequence, // then report the best scoring alignment. - void align_best_direction(DBGQueryAlignment &paths, - const ISeeder &seeder, - const ISeeder &seeder_rc, - IExtender&& extender, - IExtender&& extender_rc) const; + void align_best_direction(QueryAlignment &paths, + const ISeeder &seeder, + const ISeeder &seeder_rc, + IExtender&& extender, + IExtender&& extender_rc) const; // Align the forward and reverse complement of the query sequence in both // directions and return the overall best alignment. e.g., for the forward query @@ -128,25 +116,25 @@ class SeedAndExtendAlignerCore { // 2. Given a seed, extend forwards to get alignment A // 3. Reverse complement the alignment to get A', treat it like a new seed // 4. Extend A' forwards to get the final alignment A'' - void align_both_directions(DBGQueryAlignment &paths, - const ISeeder &forward_seeder, - const ISeeder &reverse_seeder, - IExtender&& forward_extender, - IExtender&& reverse_extender, + void align_both_directions(QueryAlignment &paths, + const ISeeder &forward_seeder, + const ISeeder &reverse_seeder, + IExtender&& forward_extender, + IExtender&& reverse_extender, const AlignCoreGenerator &rev_comp_core_generator) const; protected: // Generate seeds, then extend them void align_core(std::string_view query, - const ISeeder &seeder, - IExtender &extender, + const ISeeder &seeder, + IExtender &extender, const LocalAlignmentCallback &callback, const MinScoreComputer &get_min_path_score) const; // Given alignments generated by a generator, add them to a priority queue // and add the top ones to paths. virtual void - align_aggregate(DBGQueryAlignment &paths, + align_aggregate(QueryAlignment &paths, const AlignmentGenerator &alignment_generator) const; const DeBruijnGraph &graph_; @@ -164,7 +152,7 @@ ::align_batch(const QueryGenerator &generate_query, std::string_view query, bool is_reverse_complement) { SeedAndExtendAlignerCore aligner_core(graph_, config_); - DBGQueryAlignment paths(query, is_reverse_complement); + QueryAlignment paths(query, is_reverse_complement); std::string_view this_query = paths.get_query(is_reverse_complement); assert(this_query == query); @@ -181,7 +169,7 @@ ::align_batch(const QueryGenerator &generate_query, const auto &, auto&& rev_comp_seeds, const auto &callback) { - ManualSeeder seeder_rc(std::move(rev_comp_seeds)); + ManualSeeder seeder_rc(std::move(rev_comp_seeds)); callback(seeder_rc, Extender(graph_, config_, reverse)); }; diff --git a/metagraph/tests/graph/test_aligner.cpp b/metagraph/tests/graph/test_aligner.cpp index b4eda117a9..866e887c35 100644 --- a/metagraph/tests/graph/test_aligner.cpp +++ b/metagraph/tests/graph/test_aligner.cpp @@ -22,7 +22,7 @@ using namespace mtg::kmer; const std::string test_data_dir = "../tests/data"; const bool PICK_REV_COMP = true; -inline bool is_exact_match(const Alignment &alignment) { +inline bool is_exact_match(const Alignment &alignment) { return alignment.get_cigar().is_exact_match(alignment.get_query().size()); } @@ -287,7 +287,7 @@ TYPED_TEST(DBGAlignerTest, align_straight_forward_and_reverse_complement) { ext_paths.begin(), ext_paths.end())); // test copy - auto paths_copy = const_cast::DBGQueryAlignment&>(paths); + auto paths_copy = const_cast(paths); for (const auto &path : paths_copy) { EXPECT_TRUE(path.is_valid(*graph, &config)); } @@ -1313,7 +1313,7 @@ TEST(DBGAlignerTest, align_suffix_seed_snp_min_seed_length) { config.min_cell_score = std::numeric_limits::min() + 100; config.min_path_score = std::numeric_limits::min() + 100; config.max_seed_length = k; - DBGAligner>> aligner(*graph, config); + DBGAligner> aligner(*graph, config); auto paths = aligner.align(query); ASSERT_EQ(1ull, paths.size()); auto path = paths[0]; @@ -1347,7 +1347,7 @@ TEST(DBGAlignerTest, align_suffix_seed_snp_min_seed_length) { config.min_cell_score = std::numeric_limits::min() + 100; config.min_path_score = std::numeric_limits::min() + 100; config.max_seed_length = k; - DBGAligner>> aligner(*graph, config); + DBGAligner> aligner(*graph, config); auto paths = aligner.align(query); ASSERT_EQ(1ull, paths.size()); auto path = paths[0]; @@ -1395,7 +1395,7 @@ TEST(DBGAlignerTest, align_suffix_seed_snp_canonical) { config.min_path_score = std::numeric_limits::min() + 100; config.max_seed_length = k; config.min_seed_length = 13; - DBGAligner>> aligner(*graph, config); + DBGAligner> aligner(*graph, config); auto paths = aligner.align(query); ASSERT_EQ(1ull, paths.size()); auto path = paths[0]; @@ -1525,7 +1525,7 @@ TEST(DBGAlignerTest, align_dummy) { config.max_seed_length = k; graph->add_sequence(reference); - DBGAligner>> aligner(*graph, config); + DBGAligner> aligner(*graph, config); auto paths = aligner.align(query); ASSERT_EQ(1ull, paths.size()); auto path = paths[0]; @@ -1557,7 +1557,7 @@ TEST(DBGAlignerTest, align_extended_insert_after_match) { graph->add_sequence(reference_1); graph->add_sequence(reference_2); - DBGAligner>> aligner(*graph, config); + DBGAligner> aligner(*graph, config); auto paths = aligner.align(query); ASSERT_EQ(1ull, paths.size()); auto path = paths[0]; @@ -1587,7 +1587,7 @@ TEST(DBGAlignerTest, align_suffix_seed_no_full_seeds) { for (size_t max_seed_length : { k, k + 100 }) { config.max_seed_length = max_seed_length; - DBGAligner>> aligner(*graph, config); + DBGAligner> aligner(*graph, config); auto paths = aligner.align(query); ASSERT_EQ(1ull, paths.size()); auto path = paths[0]; diff --git a/metagraph/tests/graph/test_aligner_helpers.hpp b/metagraph/tests/graph/test_aligner_helpers.hpp index f1be4bb099..acbbc51739 100644 --- a/metagraph/tests/graph/test_aligner_helpers.hpp +++ b/metagraph/tests/graph/test_aligner_helpers.hpp @@ -25,9 +25,8 @@ inline int8_t single_char_score(const DBGAlignerConfig &config, char a, int8_t b return config.get_row(a)[b]; } -template void check_json_dump_load(const DeBruijnGraph &graph, - const Alignment &alignment, + const Alignment &alignment, const std::string &query, const std::string &rc_query = "") { ASSERT_TRUE(!rc_query.size() || query.size() == rc_query.size()); @@ -40,7 +39,7 @@ void check_json_dump_load(const DeBruijnGraph &graph, alignment.get_query().size()), alignment.get_query()); - Alignment load_alignment; + Alignment load_alignment; auto load_sequence = load_alignment.load_from_json( alignment.to_json(path_query, graph), graph @@ -63,23 +62,23 @@ void check_json_dump_load(const DeBruijnGraph &graph, << load_alignment.get_query() << "\n"; } -DBGAligner<>::DBGQueryAlignment get_extend(std::shared_ptr graph, - const DBGAlignerConfig &config, - const DBGAligner<>::DBGQueryAlignment &paths, - const std::string &query) { +QueryAlignment get_extend(std::shared_ptr graph, + const DBGAlignerConfig &config, + const QueryAlignment &paths, + const std::string &query) { assert(graph.get()); EXPECT_EQ(query, paths.get_query()); auto uniconfig = config; uniconfig.max_seed_length = std::numeric_limits::max(); return std::dynamic_pointer_cast(graph) - ? DBGAligner>>(*graph, uniconfig).align(query) - : DBGAligner>(*graph, uniconfig).align(query); + ? DBGAligner>(*graph, uniconfig).align(query) + : DBGAligner(*graph, uniconfig).align(query); } inline void check_extend(std::shared_ptr graph, const DBGAlignerConfig &config, - const DBGAligner<>::DBGQueryAlignment &paths, + const QueryAlignment &paths, const std::string &query) { auto unimem_paths = get_extend(graph, config, paths, query); From c083a6e6743acf9a1ebecbaadd222dffbe49d220 Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Sun, 25 Jul 2021 05:03:07 +0200 Subject: [PATCH 07/35] minor fix when returning multiple alignments --- metagraph/src/graph/alignment/aligner_aggregator.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/metagraph/src/graph/alignment/aligner_aggregator.hpp b/metagraph/src/graph/alignment/aligner_aggregator.hpp index c5e6c5a95a..2e9065fb8b 100644 --- a/metagraph/src/graph/alignment/aligner_aggregator.hpp +++ b/metagraph/src/graph/alignment/aligner_aggregator.hpp @@ -58,8 +58,9 @@ inline void AlignmentAggregator::add_alignment(Alignment&& ali template inline auto AlignmentAggregator ::get_min_path_score(const Alignment &) const -> score_t { - return path_queue_.size() ? path_queue_.minimum().get_score() - : config_.min_path_score; + return path_queue_.size() < config_.num_alternative_paths + ? config_.min_path_score + : path_queue_.minimum().get_score(); } template From 9f4e8e5b73571d874dd93eedc53079f71615023c Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Mon, 26 Jul 2021 18:29:37 +0200 Subject: [PATCH 08/35] Added DBGWrapper as an abstract parent class for all DeBruijnGraph wrappers (#340) * Added DBGWrapper as an abstract parent class for all DeBruijnGraph wrappers * simplify test_dbg_helpers * added get_base_graph as a convenience function --- .../src/cli/load/load_annotated_graph.cpp | 2 + metagraph/src/graph/annotated_dbg.cpp | 9 +- metagraph/src/graph/annotated_dbg.hpp | 2 + .../graph/representation/base/dbg_wrapper.hpp | 97 ++++++++++ .../representation/base/sequence_graph.hpp | 2 + .../graph/representation/canonical_dbg.cpp | 171 ++++++++---------- .../graph/representation/canonical_dbg.hpp | 149 ++++++--------- .../src/graph/representation/masked_graph.cpp | 155 +++++++--------- .../src/graph/representation/masked_graph.hpp | 75 ++++---- .../tests/graph/all/test_dbg_helpers.cpp | 140 +++----------- metagraph/tests/graph/test_aligner.cpp | 11 +- 11 files changed, 374 insertions(+), 439 deletions(-) create mode 100644 metagraph/src/graph/representation/base/dbg_wrapper.hpp diff --git a/metagraph/src/cli/load/load_annotated_graph.cpp b/metagraph/src/cli/load/load_annotated_graph.cpp index db8cbd7535..f5753ed409 100644 --- a/metagraph/src/cli/load/load_annotated_graph.cpp +++ b/metagraph/src/cli/load/load_annotated_graph.cpp @@ -22,6 +22,8 @@ using mtg::common::logger; std::unique_ptr initialize_annotated_dbg(std::shared_ptr graph, const Config &config) { + assert(graph.get() == &graph->get_base_graph()); + uint64_t max_index = graph->max_index(); const auto *dbg_graph = dynamic_cast(graph.get()); diff --git a/metagraph/src/graph/annotated_dbg.cpp b/metagraph/src/graph/annotated_dbg.cpp index bddd5e89d6..417ecec999 100644 --- a/metagraph/src/graph/annotated_dbg.cpp +++ b/metagraph/src/graph/annotated_dbg.cpp @@ -3,7 +3,6 @@ #include #include -#include "graph/representation/canonical_dbg.hpp" #include "annotation/representation/row_compressed/annotate_row_compressed.hpp" #include "annotation/int_matrix/base/int_matrix.hpp" #include "common/utils/simd_utils.hpp" @@ -689,13 +688,13 @@ ::call_annotated_nodes(const Label &label, } bool AnnotatedSequenceGraph::check_compatibility() const { - // TODO: add method max_canonical_index() and call it here without casts - if (const auto *canonical = dynamic_cast(graph_.get())) - return canonical->get_graph().max_index() == annotator_->num_objects(); - return graph_->max_index() == annotator_->num_objects(); } +bool AnnotatedDBG::check_compatibility() const { + return dbg_.get_base_graph().max_index() == annotator_->num_objects(); +} + /** * Helper functions for score_kmer_presence_mask diff --git a/metagraph/src/graph/annotated_dbg.hpp b/metagraph/src/graph/annotated_dbg.hpp index 1ffd58d4d8..edb1884afb 100644 --- a/metagraph/src/graph/annotated_dbg.hpp +++ b/metagraph/src/graph/annotated_dbg.hpp @@ -78,6 +78,8 @@ class AnnotatedDBG : public AnnotatedSequenceGraph { const DeBruijnGraph& get_graph() const { return dbg_; } + bool check_compatibility() const; + // add k-mer counts to the annotation, thread-safe for concurrent calls void add_kmer_counts(std::string_view sequence, const std::vector