diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1cf0e88cca..c7536c1f44 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -2,6 +2,11 @@ name: MetaGraph CI on: [push] +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + + jobs: Linux: @@ -173,6 +178,79 @@ jobs: - name: run integration tests run: cd metagraph/build && make check + Build-and-Push-Docker: + # adapted from https://docs.github.com/en/actions/guides/publishing-docker-images#publishing-images-to-github-packages + if: github.ref == 'refs/heads/master' + needs: [Linux] + runs-on: ubuntu-20.04 + permissions: + contents: read + packages: write + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: checkout submodules + run: git submodule update --init --recursive + + - name: Log in to the Container registry + uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v3 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + + - name: Build and push Docker image + uses: docker/build-push-action@v2 + with: + context: . 
+ push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + + + Metagraph-Workflows: + name: Test metagraph workflows + runs-on: ubuntu-20.04 + needs: [Linux] + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python 3.8 + uses: actions/setup-python@v1 + with: + python-version: 3.8 + + - name: fetch static binary + uses: actions/download-artifact@v2 + with: + path: artifacts + + - name: setup metagraph binary + run: | + sudo ln -s $(pwd)/artifacts/metagraph_DNA_linux_x86/metagraph_DNA /usr/local/bin/metagraph + sudo chmod +rx /usr/local/bin/metagraph + /usr/local/bin/metagraph --help + metagraph --help + + - name: Install python dependencies + run: | + python -m pip install --upgrade pip + pip install pytest + pip install -r metagraph/workflows/requirements.txt + - name: Test metagraph-workflows pytest + run: | + cd metagraph/workflows + pytest + Release: name: Create Release if: contains(github.ref, 'tags/v') diff --git a/Dockerfile b/Dockerfile index 9c399a0c1d..d94009c7f3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -87,7 +87,7 @@ FROM ubuntu:20.04 ARG CODE_BASE # the image used in production. It contains a basic runtime environment for metagraph without build tools along with -# the metagraph binary and python API code. This image is published on dockerhub (`ratschlab/metagraph`). +# the metagraph binary and python API code. This image is published on github's container registry (`ghcr.io/ratschlab/metagraph`). RUN apt-get update && apt-get install -y \ libatomic1 \ diff --git a/README.md b/README.md index 662812d1b5..0d1dd1df2f 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,31 @@ At the same time, the provided workflows and their careful implementation, combi ## Install -See [docs online](https://metagraph.ethz.ch/static/docs/index.html). 
+### Conda + +Install the [latest release](https://github.com/ratschlab/metagraph/releases/latest) on Linux or Mac OS X with Anaconda: + +``` +conda install -c bioconda -c conda-forge metagraph +``` + +### Docker + +If docker is available on the system, immediately get started with + +``` +docker run -v ${HOME}:/mnt ghcr.io/ratschlab/metagraph:master build -v -k 10 \ + -o /mnt/transcripts_1000 \ + /mnt/transcripts_1000.fa +``` + +(Replace `${HOME}` with a directory on the host system to map it under `/mnt` in the container.) + +All different versions of the container are listed [here](https://github.com/ratschlab/metagraph/pkgs/container/metagraph). + +### Install From Sources + +To compile from source, see [documentation online](https://metagraph.ethz.ch/static/docs/installation.html#install-from-source) (e.g., for builds with custom configurations). ## Typical workflow diff --git a/metagraph/api/python/README.rst b/metagraph/api/python/README.rst index 529637dc9f..c8d55debbb 100644 --- a/metagraph/api/python/README.rst +++ b/metagraph/api/python/README.rst @@ -31,3 +31,4 @@ Usage For more examples, see `notebooks <./notebooks>`_. 
+ diff --git a/metagraph/api/python/setup.py b/metagraph/api/python/setup.py index 594a8d1c25..875c425988 100644 --- a/metagraph/api/python/setup.py +++ b/metagraph/api/python/setup.py @@ -31,10 +31,6 @@ 'Programming Language :: Python :: 3.6', ], description="Metagraph Toolkit", - entry_points={ - 'console_scripts': [ - ], - }, install_requires=requirements, license="MIT license", long_description=readme, diff --git a/metagraph/api/python/tests/test_helpers.py b/metagraph/api/python/tests/test_helpers.py index da51b6d636..770c53d00c 100644 --- a/metagraph/api/python/tests/test_helpers.py +++ b/metagraph/api/python/tests/test_helpers.py @@ -14,7 +14,7 @@ def _load_json_data(filename): @pytest.mark.parametrize("file_name,align,expected_shape", [ ('search_response.json', False, (4, 15)), - ('search_with_align_response.json', True, (354, 18)) + ('search_with_align_response.json', True, (354, 15)) ]) def test_df_from_search_result(file_name, align, expected_shape): json_obj = _load_json_data(file_name) @@ -27,9 +27,6 @@ def test_df_from_search_result(file_name, align, expected_shape): 'metasub_name', 'num_reads', 'sample_type', 'station', 'surface_material', 'seq_description'] - if align: - expected_cols = expected_cols + ['sequence', 'score', 'cigar'] - assert list(df.columns) == expected_cols diff --git a/metagraph/docs/source/index.rst b/metagraph/docs/source/index.rst index 1e25093e0b..6f33ad4e3e 100644 --- a/metagraph/docs/source/index.rst +++ b/metagraph/docs/source/index.rst @@ -12,9 +12,8 @@ framework, a software platform for indexing and analysis of very large sequence installation.rst quick_start.rst + workflows.rst api.rst sequence_search.rst sequence_assembly.rst resources.rst - - diff --git a/metagraph/docs/source/installation.rst b/metagraph/docs/source/installation.rst index bc23d57b6a..1b24642e5e 100644 --- a/metagraph/docs/source/installation.rst +++ b/metagraph/docs/source/installation.rst @@ -23,13 +23,15 @@ Docker container If docker is available on 
your system, you can immediately get started with:: - docker run -v ${DATA_DIR_HOST}:/mnt ratschlab/metagraph \ + docker run -v ${DATA_DIR_HOST}:/mnt ghcr.io/ratschlab/metagraph:master \ build -v -k 10 -o /mnt/transcripts_1000 /mnt/transcripts_1000.fa where you'd need to replace ``${DATA_DIR_HOST}`` with a directory on the host system to map it under ``/mnt`` in the container. This docker container uses the latest version of MetaGraph from the source `GitHub repository `_ (branch ``master``). +See also the `image overview <https://github.com/ratschlab/metagraph/pkgs/container/metagraph>`_ for +other versions of the metagraph image. Install from source @@ -131,7 +133,7 @@ To compile MetaGraph, please follow these steps. git clone --recursive https://github.com/ratschlab/metagraph.git #. Change into the ``metagraph`` directory:: - + cd metagraph #. Make sure all submodules have been downloaded:: diff --git a/metagraph/docs/source/workflows.rst b/metagraph/docs/source/workflows.rst new file mode 100644 index 0000000000..2a7b0564a8 --- /dev/null +++ b/metagraph/docs/source/workflows.rst @@ -0,0 +1,105 @@ +========= +Workflows +========= + +This package provides workflows for the `metagraph framework +`_ + + +Workflows for Creating Graphs and Annotations +--------------------------------------------- + +Since the creation of graph and indices comprises several steps, this package provides +some support to simplify these tasks - in particular for standard cases. + +Given some raw sequence data and a few options like the kmer size (`k`), graphs and annotations +are automatically built: + +.. code-block:: bash + + metagraph-workflows build -k 5 transcript_paths.txt /tmp/mygraph + + +If you prefer invoking the workflow from within a python script, the following is equivalent: + +.. code-block:: python + + from metagraph_workflows import workflows + workflows.run_build_workflow('/tmp/mygraph', seqs_file_list_path='transcript_paths.txt', k=5) + + + +The workflow logic itself is expressed as a `Snakemake workflow +`_ . 
You can also directly invoke the workflows +using the `snakemake` command line tool (see below). + + +Installation and Set up +~~~~~~~~~~~~~~~~~~~~~~~ + + +Set up a conda environment and install the necessary packages using: + +.. code-block:: bash + + conda create -n metagraph-workflows python=3.8 + conda activate metagraph-workflows + conda install -c bioconda -c conda-forge metagraph + pip install -U "git+https://github.com/ratschlab/metagraph.git#subdirectory=metagraph/workflows" + + + + +Usage Example +~~~~~~~~~~~~~ + +Typically, the following steps would be performed: + +1. sequence file preparation: add your sequence files of interest into a directory. +2. running workflow: you can invoke the workflow using ``metagraph-workflows build``. Important parameters you may consider tuning are: + + * k + * primary vs non primary graph creation + * annotation label source: ``sequence_headers`` or ``sequence_file_names`` + + An example invocation: + + .. code-block:: bash + + metagraph-workflows build -k 31 \ + --seqs-dir-path [PATH_TO_SEQUENCES] \ + --annotation-labels-source sequence_headers \ + --build-primary-graph \ + [OUTPUT_DIR] + + see ``metagraph-workflows build --help`` for more help +3. do queries: once you have created the indices you can query them either by using the command line + query tool or by starting the metagraph server on your laptop or another suitable machine and + sending queries using e.g. the python :ref:`API` client. + + +There is also a `jupyter notebook `_ walking you through an example from indexing to api querying. + + + +Workflow Management +~~~~~~~~~~~~~~~~~~~ + +The following snakemake options are exposed in the ``build`` subcommand + + * ``--dryrun``: see what workflow steps would be done + * ``--force`` (corresponds to ``--forceall`` in snakemake): force run all steps + + +Directly Invoking Snakemake Workflow +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The above command is only a wrapper around a snakemake workflow. 
You can also +directly invoke the snakemake workflow (assuming you checked out the `metagraph git repository `_): + +.. code-block:: bash + + cd metagraph/workflows + snakemake --forceall --configfile default.yml \ + --config k=5 seqs_file_list_path='transcript_paths.txt' output_directory=/tmp/mygraph \ + annotation_labels_source=sequence_headers --cores 2 diff --git a/metagraph/integration_tests/base.py b/metagraph/integration_tests/base.py index 84f3f5aafe..1f0037edd2 100644 --- a/metagraph/integration_tests/base.py +++ b/metagraph/integration_tests/base.py @@ -104,9 +104,16 @@ def _clean(graph, output, extra_params=''): @staticmethod def _annotate_graph(input, graph_path, output, anno_repr, - separate=False, no_fork_opt=False, no_anchor_opt=False): + separate=False, no_fork_opt=False, no_anchor_opt=False, + anno_type='header'): target_anno = anno_repr + + noswap = anno_repr.endswith('_noswap') + if noswap: + anno_repr = anno_repr[:-len('_noswap')] + if (anno_repr in {'row_sparse', 'column_coord'} or + anno_repr.endswith('_coord') or anno_repr.endswith('brwt') or anno_repr.startswith('row_diff')): target_anno = anno_repr @@ -115,7 +122,7 @@ def _annotate_graph(input, graph_path, output, anno_repr, target_anno = anno_repr anno_repr = 'row' - command = f'{METAGRAPH} annotate -p {NUM_THREADS} --anno-header \ + command = f'{METAGRAPH} annotate -p {NUM_THREADS} --anno-{anno_type}\ -i {graph_path} --anno-type {anno_repr} \ -o {output} {input}' @@ -141,6 +148,8 @@ def _annotate_graph(input, graph_path, output, anno_repr, {output + anno_file_extension[anno_repr]}' other_args = ' --count-kmers' if with_counts else '' + other_args += ' --coordinates' if final_anno.endswith('_coord') else '' + other_args += ' --disk-swap \"\"' if noswap else '' if target_anno == 'row_diff': command += ' -i ' + graph_path @@ -170,7 +179,7 @@ def _annotate_graph(input, graph_path, output, anno_repr, assert(res.returncode == 0) if final_anno != target_anno: - rd_type = 'column' if with_counts 
else 'row_diff' + rd_type = 'column' if with_counts or final_anno.endswith('_coord') else 'row_diff' command = f'{METAGRAPH} transform_anno --anno-type {final_anno} --greedy -o {output} ' \ f'-i {graph_path} -p {NUM_THREADS} {output}.{rd_type}.annodbg' res = subprocess.run([command], shell=True) @@ -178,3 +187,8 @@ def _annotate_graph(input, graph_path, output, anno_repr, os.remove(output + anno_file_extension[rd_type]) else: os.remove(output + anno_file_extension[anno_repr]) + + if final_anno.endswith('brwt') or final_anno.endswith('brwt_coord'): + command = f'{METAGRAPH} relax_brwt -o {output} -p {NUM_THREADS} {output}.{final_anno}.annodbg' + res = subprocess.run([command], shell=True) + assert (res.returncode == 0) diff --git a/metagraph/integration_tests/test_align.py b/metagraph/integration_tests/test_align.py index b7fcb25522..6cfeee9d1f 100644 --- a/metagraph/integration_tests/test_align.py +++ b/metagraph/integration_tests/test_align.py @@ -41,7 +41,7 @@ def test_simple_align_all_graphs(self, representation): self.assertEqual('nodes (k): 16438', params_str[1]) self.assertEqual('mode: basic', params_str[2]) - stats_command = '{exe} align -i {graph} --align-min-exact-match 0.0 {reads}'.format( + stats_command = '{exe} align --align-only-forwards -i {graph} --align-min-exact-match 0.0 {reads}'.format( exe=METAGRAPH, graph=self.tempdir.name + '/genome.MT' + graph_file_extension[representation], reads=TEST_DATA_DIR + '/genome_MT1.fq', @@ -49,13 +49,13 @@ def test_simple_align_all_graphs(self, representation): res = subprocess.run(stats_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) params_str = res.stdout.decode().rstrip().split('\n') - self.assertEqual(len(params_str), 6) + self.assertEqual(len(params_str), 7) self.assertEqual(params_str[0], 'MT-10/1\tAACAGAGAATAGTTTAAATTAGAATCTTAGCTTTGGGTGCTAATGGTGGAGTTAAAGACTTTTTCTCTGATTTGTCCTTGGAAAAAGGTTTTCATCTCCGGTTTACAAGACTGGTGTATTAGTTTATACTACAAGGACAGGCCCATTTGA\t+\tTAGAATCTTAG\t22\t11\t19S11=120S\t0') 
self.assertEqual(params_str[1], 'MT-8/1\tAAAACTAACCCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTAC\t+\tAAAACTAACCCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTAC\t300\t150\t150=\t0') self.assertEqual(params_str[2], 'MT-6/1\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t+\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t300\t150\t150=\t0') self.assertEqual(params_str[3], 'MT-4/1\tAGTATAGTAGTTCGCTTTGACTGGTGAAGTCTTAGCATGTACTGCTCGGAGGTTCGGTTCTGCTCCGAGGTCGCCCCAACCGAAATTTTTAATGCAGGTTTGGTAGTTTAGGACCTGTGGGTTTGTTAGGTACTGTTTGCATTAATAAAT\t*\t*\t0\t*\t*\t*') self.assertEqual(params_str[4], 'MT-2/1\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t+\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t300\t150\t150=\t0') - last_split = params_str[5].split("\t"); + last_split = params_str[5].split("\t") self.assertEqual(last_split[0], "MT-11/1") self.assertEqual(last_split[1], "AACAGAGAATTGTTTAAATTACAATCTTAGCTATGGGTGCTAAAGGTGGAGTTATAGACTTTTTCACTGATTTGTCGTTGGAAAAAGCTTTTCATCTCGGGTTTACAAGTCTGGTGTATTTGTTTATACTAGAAGGACAGGCGCATTTGA") self.assertEqual(last_split[4], "22") @@ -82,13 +82,14 @@ def test_simple_align_map_all_graphs(self, representation): res = subprocess.run(stats_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) params_str = res.stdout.decode().rstrip().split('\n') - self.assertEqual(len(params_str), 6) + self.assertEqual(len(params_str), 7) 
self.assertEqual(params_str[0], 'MT-10/1\t1/140/1') self.assertEqual(params_str[1], 'MT-8/1\t140/140/140') self.assertEqual(params_str[2], 'MT-6/1\t140/140/140') self.assertEqual(params_str[3], 'MT-4/1\t0/140/0') self.assertEqual(params_str[4], 'MT-2/1\t140/140/140') self.assertEqual(params_str[5], 'MT-11/1\t1/140/1') + self.assertEqual(params_str[6], 'MT-11/1\t1/140/1') @parameterized.expand(GRAPH_TYPES) def test_simple_align_map_canonical_all_graphs(self, representation): @@ -112,13 +113,14 @@ def test_simple_align_map_canonical_all_graphs(self, representation): res = subprocess.run(stats_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) params_str = res.stdout.decode().rstrip().split('\n') - self.assertEqual(len(params_str), 6) + self.assertEqual(len(params_str), 7) self.assertEqual(params_str[0], 'MT-10/1\t140/140/140') self.assertEqual(params_str[1], 'MT-8/1\t140/140/140') self.assertEqual(params_str[2], 'MT-6/1\t140/140/140') self.assertEqual(params_str[3], 'MT-4/1\t129/140/129') self.assertEqual(params_str[4], 'MT-2/1\t140/140/139') self.assertEqual(params_str[5], 'MT-11/1\t2/140/2') + self.assertEqual(params_str[6], 'MT-11/1\t140/140/140') @parameterized.expand(['succinct']) def test_simple_align_json_all_graphs(self, representation): @@ -134,7 +136,7 @@ def test_simple_align_json_all_graphs(self, representation): self.assertEqual('nodes (k): 16438', params_str[1]) self.assertEqual('mode: basic', params_str[2]) - stats_command = '{exe} align -i {graph} --align-min-exact-match 0.0 {reads}'.format( + stats_command = '{exe} align --align-only-forwards -i {graph} --align-min-exact-match 0.0 {reads}'.format( exe=METAGRAPH, graph=self.tempdir.name + '/genome.MT' + graph_file_extension[representation], reads=TEST_DATA_DIR + '/genome_MT1.fq', @@ -142,7 +144,7 @@ def test_simple_align_json_all_graphs(self, representation): res = subprocess.run(stats_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) params_str = 
res.stdout.decode().rstrip().split('\n') - self.assertEqual(len(params_str), 6) + self.assertEqual(len(params_str), 7) @parameterized.expand(GRAPH_TYPES) def test_simple_align_fwd_rev_comp_all_graphs(self, representation): @@ -158,7 +160,7 @@ def test_simple_align_fwd_rev_comp_all_graphs(self, representation): self.assertEqual('nodes (k): 16438', params_str[1]) self.assertEqual('mode: basic', params_str[2]) - stats_command = '{exe} align --align-both-strands -i {graph} --align-min-exact-match 0.0 {reads}'.format( + stats_command = '{exe} align -i {graph} --align-min-exact-match 0.0 {reads}'.format( exe=METAGRAPH, graph=self.tempdir.name + '/genome.MT' + graph_file_extension[representation], reads=TEST_DATA_DIR + '/genome_MT1.fq', @@ -166,13 +168,13 @@ def test_simple_align_fwd_rev_comp_all_graphs(self, representation): res = subprocess.run(stats_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) params_str = res.stdout.decode().rstrip().split('\n') - self.assertEqual(len(params_str), 6) + self.assertEqual(len(params_str), 7) self.assertEqual(params_str[0], 'MT-10/1\tAACAGAGAATAGTTTAAATTAGAATCTTAGCTTTGGGTGCTAATGGTGGAGTTAAAGACTTTTTCTCTGATTTGTCCTTGGAAAAAGGTTTTCATCTCCGGTTTACAAGACTGGTGTATTAGTTTATACTACAAGGACAGGCCCATTTGA\t-\tTCAAATGGGCCTGTCCTTGTAGTATAAACTAATACACCAGTCTTGTAAACCGGAGATGAAAACCTTTTTCCAAGGACAAATCAGAGAAAAAGTCTTTAACTCCACCATTAGCACCCAAAGCTAAGATTCTAATTTAAACTATTCTCTGTT\t300\t150\t150=\t0') self.assertEqual(params_str[1], 'MT-8/1\tAAAACTAACCCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTAC\t+\tAAAACTAACCCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTAC\t300\t150\t150=\t0') self.assertEqual(params_str[2], 
'MT-6/1\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t+\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t300\t150\t150=\t0') self.assertEqual(params_str[3], 'MT-4/1\tAGTATAGTAGTTCGCTTTGACTGGTGAAGTCTTAGCATGTACTGCTCGGAGGTTCGGTTCTGCTCCGAGGTCGCCCCAACCGAAATTTTTAATGCAGGTTTGGTAGTTTAGGACCTGTGGGTTTGTTAGGTACTGTTTGCATTAATAAAT\t-\tATTTATTAATGCAAACAGTACCTAACAAACCCACAGGTCCTAAACTACCAAACCTGCATTAAAAATTTCGGTTGGGGCGACCTCGGAGCAGAACCCAACCTCCGAGCAGTACATGCTAAGACTTCACCAGTCAAAGCGAACTACTATACT\t295\t149\t95=1X54=\t0') self.assertEqual(params_str[4], 'MT-2/1\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t+\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t300\t150\t150=\t0') - last_split = params_str[5].split("\t"); + last_split = params_str[5].split("\t") self.assertEqual(last_split[0], "MT-11/1") self.assertEqual(last_split[1], "AACAGAGAATTGTTTAAATTACAATCTTAGCTATGGGTGCTAAAGGTGGAGTTATAGACTTTTTCACTGATTTGTCGTTGGAAAAAGCTTTTCATCTCGGGTTTACAAGTCTGGTGTATTTGTTTATACTAGAAGGACAGGCGCATTTGA") self.assertEqual(last_split[4], "22") @@ -180,7 +182,6 @@ def test_simple_align_fwd_rev_comp_all_graphs(self, representation): @parameterized.expand(GRAPH_TYPES) def test_simple_align_canonical_all_graphs(self, representation): - self._build_graph(input=TEST_DATA_DIR + '/genome.MT.fa', output=self.tempdir.name + '/genome.MT', k=11, repr=representation, mode='canonical', @@ -200,13 +201,14 @@ def test_simple_align_canonical_all_graphs(self, representation): res = subprocess.run(stats_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) params_str = res.stdout.decode().rstrip().split('\n') - 
self.assertEqual(len(params_str), 6) + self.maxDiff = None + self.assertEqual(len(params_str), 7) self.assertEqual(params_str[0], 'MT-10/1\tAACAGAGAATAGTTTAAATTAGAATCTTAGCTTTGGGTGCTAATGGTGGAGTTAAAGACTTTTTCTCTGATTTGTCCTTGGAAAAAGGTTTTCATCTCCGGTTTACAAGACTGGTGTATTAGTTTATACTACAAGGACAGGCCCATTTGA\t+\tAACAGAGAATAGTTTAAATTAGAATCTTAGCTTTGGGTGCTAATGGTGGAGTTAAAGACTTTTTCTCTGATTTGTCCTTGGAAAAAGGTTTTCATCTCCGGTTTACAAGACTGGTGTATTAGTTTATACTACAAGGACAGGCCCATTTGA\t300\t150\t150=\t0') self.assertEqual(params_str[1], 'MT-8/1\tAAAACTAACCCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTAC\t+\tAAAACTAACCCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTAC\t300\t150\t150=\t0') self.assertEqual(params_str[2], 'MT-6/1\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t+\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t300\t150\t150=\t0') self.assertEqual(params_str[3], 'MT-4/1\tAGTATAGTAGTTCGCTTTGACTGGTGAAGTCTTAGCATGTACTGCTCGGAGGTTCGGTTCTGCTCCGAGGTCGCCCCAACCGAAATTTTTAATGCAGGTTTGGTAGTTTAGGACCTGTGGGTTTGTTAGGTACTGTTTGCATTAATAAAT\t+\tAGTATAGTAGTTCGCTTTGACTGGTGAAGTCTTAGCATGTACTGCTCGGAGGTTGGGTTCTGCTCCGAGGTCGCCCCAACCGAAATTTTTAATGCAGGTTTGGTAGTTTAGGACCTGTGGGTTTGTTAGGTACTGTTTGCATTAATAAAT\t295\t149\t54=1X95=\t0') self.assertEqual(params_str[4], 'MT-2/1\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t+\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t300\t150\t150=\t0') - last_split = params_str[5].split("\t"); + last_split = 
params_str[5].split("\t") self.assertEqual(last_split[0], "MT-11/1") self.assertEqual(last_split[1], "AACAGAGAATTGTTTAAATTACAATCTTAGCTATGGGTGCTAAAGGTGGAGTTATAGACTTTTTCACTGATTTGTCGTTGGAAAAAGCTTTTCATCTCGGGTTTACAAGTCTGGTGTATTTGTTTATACTAGAAGGACAGGCGCATTTGA") self.assertEqual(last_split[4], "22") @@ -233,7 +235,7 @@ def test_simple_align_canonical_subk_succinct(self, representation): res = subprocess.run(stats_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) params_str = res.stdout.decode().rstrip().split('\n') - self.assertEqual(len(params_str), 6) + self.assertEqual(len(params_str), 7) self.assertEqual(params_str[0], 'MT-10/1\tAACAGAGAATAGTTTAAATTAGAATCTTAGCTTTGGGTGCTAATGGTGGAGTTAAAGACTTTTTCTCTGATTTGTCCTTGGAAAAAGGTTTTCATCTCCGGTTTACAAGACTGGTGTATTAGTTTATACTACAAGGACAGGCCCATTTGA\t+\tAACAGAGAATAGTTTAAATTAGAATCTTAGCTTTGGGTGCTAATGGTGGAGTTAAAGACTTTTTCTCTGATTTGTCCTTGGAAAAAGGTTTTCATCTCCGGTTTACAAGACTGGTGTATTAGTTTATACTACAAGGACAGGCCCATTTGA\t300\t150\t150=\t0') self.assertEqual(params_str[1], 'MT-8/1\tAAAACTAACCCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTAC\t+\tAAAACTAACCCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTAC\t300\t150\t150=\t0') self.assertEqual(params_str[2], 'MT-6/1\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t+\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t300\t150\t150=\t0') @@ -263,13 +265,14 @@ def test_simple_align_primary_all_graphs(self, representation): res = subprocess.run(stats_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) params_str = res.stdout.decode().rstrip().split('\n') - self.assertEqual(len(params_str), 6) + 
self.assertEqual(len(params_str), 7) self.assertEqual(params_str[0], 'MT-10/1\tAACAGAGAATAGTTTAAATTAGAATCTTAGCTTTGGGTGCTAATGGTGGAGTTAAAGACTTTTTCTCTGATTTGTCCTTGGAAAAAGGTTTTCATCTCCGGTTTACAAGACTGGTGTATTAGTTTATACTACAAGGACAGGCCCATTTGA\t+\tAACAGAGAATAGTTTAAATTAGAATCTTAGCTTTGGGTGCTAATGGTGGAGTTAAAGACTTTTTCTCTGATTTGTCCTTGGAAAAAGGTTTTCATCTCCGGTTTACAAGACTGGTGTATTAGTTTATACTACAAGGACAGGCCCATTTGA\t300\t150\t150=\t0') self.assertEqual(params_str[1], 'MT-8/1\tAAAACTAACCCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTAC\t+\tAAAACTAACCCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTAC\t300\t150\t150=\t0') self.assertEqual(params_str[2], 'MT-6/1\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t+\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t300\t150\t150=\t0') self.assertEqual(params_str[3], 'MT-4/1\tAGTATAGTAGTTCGCTTTGACTGGTGAAGTCTTAGCATGTACTGCTCGGAGGTTCGGTTCTGCTCCGAGGTCGCCCCAACCGAAATTTTTAATGCAGGTTTGGTAGTTTAGGACCTGTGGGTTTGTTAGGTACTGTTTGCATTAATAAAT\t+\tAGTATAGTAGTTCGCTTTGACTGGTGAAGTCTTAGCATGTACTGCTCGGAGGTTGGGTTCTGCTCCGAGGTCGCCCCAACCGAAATTTTTAATGCAGGTTTGGTAGTTTAGGACCTGTGGGTTTGTTAGGTACTGTTTGCATTAATAAAT\t295\t149\t54=1X95=\t0') self.assertEqual(params_str[4], 'MT-2/1\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t+\tTGTGTTAATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATCATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCAAAC\t300\t150\t150=\t0') - last_split = params_str[5].split("\t"); + self.assertEqual(params_str[6].split("\t")[4], "300") + last_split = 
params_str[5].split("\t") self.assertEqual(last_split[0], "MT-11/1") self.assertEqual(last_split[1], "AACAGAGAATTGTTTAAATTACAATCTTAGCTATGGGTGCTAAAGGTGGAGTTATAGACTTTTTCACTGATTTGTCGTTGGAAAAAGCTTTTCATCTCGGGTTTACAAGTCTGGTGTATTTGTTTATACTAGAAGGACAGGCGCATTTGA") self.assertEqual(last_split[4], "22") @@ -296,7 +299,7 @@ def test_simple_align_primary_subk_succinct(self, representation): res = subprocess.run(stats_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) params_str = res.stdout.decode().rstrip().split('\n') - self.assertEqual(len(params_str), 6) + self.assertEqual(len(params_str), 7) self.assertEqual(params_str[0], 'MT-10/1\tAACAGAGAATAGTTTAAATTAGAATCTTAGCTTTGGGTGCTAATGGTGGAGTTAAAGACTTTTTCTCTGATTTGTCCTTGGAAAAAGGTTTTCATCTCCGGTTTACAAGACTGGTGTATTAGTTTATACTACAAGGACAGGCCCATTTGA\t+\tAACAGAGAATAGTTTAAATTAGAATCTTAGCTTTGGGTGCTAATGGTGGAGTTAAAGACTTTTTCTCTGATTTGTCCTTGGAAAAAGGTTTTCATCTCCGGTTTACAAGACTGGTGTATTAGTTTATACTACAAGGACAGGCCCATTTGA\t300\t150\t150=\t0') self.assertEqual(params_str[1], 'MT-8/1\tAAAACTAACCCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTAC\t+\tAAAACTAACCCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTAC\t300\t150\t150=\t0') self.assertEqual(params_str[2], 'MT-6/1\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t+\tATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATA\t300\t150\t150=\t0') @@ -309,26 +312,24 @@ def test_simple_align_fwd_rev_comp_json_all_graphs(self, representation): self._build_graph(input=TEST_DATA_DIR + '/genome.MT.fa', output=self.tempdir.name + '/genome.MT', - k=11, repr=representation, - extra_params="--mask-dummy") + k=11, repr=representation) res = 
self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) params_str = res.stdout.decode().split('\n')[2:] self.assertEqual('k: 11', params_str[0]) - self.assertEqual('nodes (k): 16438', params_str[1]) + self.assertEqual('nodes (k): 16461', params_str[1]) self.assertEqual('mode: basic', params_str[2]) - stats_command = '{exe} align -o {output} --json --align-both-strands -i {graph} --align-min-exact-match 0.0 {reads}'.format( + stats_command = '{exe} align --json -i {graph} --align-min-exact-match 0.0 {reads}'.format( exe=METAGRAPH, graph=self.tempdir.name + '/genome.MT' + graph_file_extension[representation], reads=TEST_DATA_DIR + '/genome_MT1.fq', - output=self.tempdir.name + '/genome.MT' + graph_file_extension[representation] + '.align.json', ) res = subprocess.run(stats_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) - params_str = open(self.tempdir.name + '/genome.MT' + graph_file_extension[representation] + '.align.json', 'r').readlines() - self.assertEqual(len(params_str), 6) - ref_align_str = open(TEST_DATA_DIR + '/genome_MT1.align.json', 'r').readlines() + params_str = res.stdout.decode().rstrip().split('\n') + self.assertEqual(len(params_str), 7) + ref_align_str = [a.rstrip() for a in open(TEST_DATA_DIR + '/genome_MT1.align.json', 'r').readlines()] for [a, b] in zip(params_str, ref_align_str): self.assertEqual(a, b) @@ -337,26 +338,24 @@ def test_simple_align_edit_distance_all_graphs(self, representation): self._build_graph(input=TEST_DATA_DIR + '/genome.MT.fa', output=self.tempdir.name + '/genome.MT', - k=11, repr=representation, - extra_params="--mask-dummy") + k=11, repr=representation) res = self._get_stats(self.tempdir.name + '/genome.MT' + graph_file_extension[representation]) params_str = res.stdout.decode().split('\n')[2:] self.assertEqual('k: 11', params_str[0]) - self.assertEqual('nodes (k): 16438', params_str[1]) + self.assertEqual('nodes (k): 16461', params_str[1]) self.assertEqual('mode: 
basic', params_str[2]) - stats_command = '{exe} align -o {output} --json --align-both-strands --align-edit-distance -i {graph} --align-min-exact-match 0.0 {reads}'.format( + stats_command = '{exe} align --json --align-edit-distance -i {graph} --align-min-exact-match 0.0 {reads}'.format( exe=METAGRAPH, graph=self.tempdir.name + '/genome.MT' + graph_file_extension[representation], reads=TEST_DATA_DIR + '/genome_MT1.fq', - output=self.tempdir.name + '/genome.MT' + graph_file_extension[representation] + '.align.json', ) res = subprocess.run(stats_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) - params_str = open(self.tempdir.name + '/genome.MT' + graph_file_extension[representation] + '.align.json', 'r').readlines() - self.assertEqual(len(params_str), 6) - ref_align_str = open(TEST_DATA_DIR + '/genome_MT1.align.edit.json', 'r').readlines() + params_str = res.stdout.decode().rstrip().split('\n') + self.assertEqual(len(params_str), 7) + ref_align_str = [a.rstrip() for a in open(TEST_DATA_DIR + '/genome_MT1.align.edit.json', 'r').readlines()] for [a, b] in zip(params_str, ref_align_str): self.assertEqual(a, b) diff --git a/metagraph/integration_tests/test_query.py b/metagraph/integration_tests/test_query.py index 159cab88e3..db8292c280 100644 --- a/metagraph/integration_tests/test_query.py +++ b/metagraph/integration_tests/test_query.py @@ -9,6 +9,7 @@ import numpy as np from helpers import get_test_class_name from base import TestingBase, METAGRAPH, TEST_DATA_DIR, graph_file_extension +import hashlib """Test graph construction""" @@ -18,11 +19,15 @@ anno_file_extension = {'column': '.column.annodbg', 'column_coord': '.column_coord.annodbg', + 'brwt_coord': '.brwt_coord.annodbg', + 'row_diff_coord': '.row_diff_coord.annodbg', + 'row_diff_brwt_coord': '.row_diff_brwt_coord.annodbg', 'row': '.row.annodbg', 'row_diff': '.row_diff.annodbg', 'row_sparse': '.row_sparse.annodbg', 'row_diff_brwt': '.row_diff_brwt.annodbg', 'row_diff_sparse': 
'.row_diff_sparse.annodbg', + 'row_diff_sparse_noswap': '.row_diff_sparse.annodbg', 'rb_brwt': '.rb_brwt.annodbg', 'brwt': '.brwt.annodbg', 'int_brwt': '.int_brwt.annodbg', @@ -134,6 +139,10 @@ def check_suffix(anno_repr, suffix): assert('labels: 100' == params_str[0]) if cls.graph_repr != 'hashfast' and (cls.graph_repr != 'succinct' or cls.mask_dummy): assert('objects: 46960' == params_str[1]) + + if cls.anno_repr.endswith('_noswap'): + cls.anno_repr = cls.anno_repr[:-len('_noswap')] + assert('representation: ' + cls.anno_repr == params_str[3]) def test_query(self): @@ -239,7 +248,7 @@ def test_query_with_align(self): res = subprocess.run(query_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) if DNA_MODE: - self.assertEqual(len(res.stdout), 12241) + self.assertEqual(len(res.stdout), 12248) else: self.assertEqual(len(res.stdout), 12244) @@ -252,7 +261,7 @@ def test_query_with_align(self): res = subprocess.run(query_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) if DNA_MODE: - self.assertEqual(len(res.stdout), 12347) + self.assertEqual(len(res.stdout), 12354) else: self.assertEqual(len(res.stdout), 12350) @@ -267,7 +276,7 @@ def test_query_with_align(self): res = subprocess.run(query_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) if DNA_MODE: - self.assertEqual(len(res.stdout), 12241) + self.assertEqual(len(res.stdout), 12248) else: self.assertEqual(len(res.stdout), 12244) @@ -281,7 +290,7 @@ def test_query_with_align(self): res = subprocess.run(query_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) if DNA_MODE: - self.assertEqual(len(res.stdout), 12347) + self.assertEqual(len(res.stdout), 12354) else: self.assertEqual(len(res.stdout), 12350) @@ -297,7 +306,7 @@ def test_query_with_align_both(self): ) res = subprocess.run(query_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) - self.assertEqual(len(res.stdout), 20522) + self.assertEqual(len(res.stdout), 24565) query_command = 
'{exe} query --fwd-and-reverse --align --count-labels -i {graph} -a {annotation} -p {num_theads} --discovery-fraction 0.0 --align-min-exact-match 0.0 {input}'.format( exe=METAGRAPH, @@ -308,7 +317,7 @@ def test_query_with_align_both(self): ) res = subprocess.run(query_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) - self.assertEqual(len(res.stdout), 20636) + self.assertEqual(len(res.stdout), 24777) def test_batch_query(self): query_command = '{exe} query --fast -i {graph} -a {annotation} --discovery-fraction 1.0 {input}'.format( @@ -413,7 +422,7 @@ def test_batch_query_with_align(self): res = subprocess.run(query_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) if DNA_MODE: - self.assertEqual(len(res.stdout), 12241) + self.assertEqual(len(res.stdout), 12248) else: self.assertEqual(len(res.stdout), 12244) @@ -426,7 +435,7 @@ def test_batch_query_with_align(self): res = subprocess.run(query_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) if DNA_MODE: - self.assertEqual(len(res.stdout), 12347) + self.assertEqual(len(res.stdout), 12354) else: self.assertEqual(len(res.stdout), 12350) @@ -441,7 +450,7 @@ def test_batch_query_with_align(self): res = subprocess.run(query_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) if DNA_MODE: - self.assertEqual(len(res.stdout), 12241) + self.assertEqual(len(res.stdout), 12248) else: self.assertEqual(len(res.stdout), 12244) @@ -455,7 +464,7 @@ def test_batch_query_with_align(self): res = subprocess.run(query_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) if DNA_MODE: - self.assertEqual(len(res.stdout), 12347) + self.assertEqual(len(res.stdout), 12354) else: self.assertEqual(len(res.stdout), 12350) @@ -471,7 +480,7 @@ def test_batch_query_with_align_both(self): ) res = subprocess.run(query_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) - self.assertEqual(len(res.stdout), 20522) + self.assertEqual(len(res.stdout), 24565) 
query_command = '{exe} query --fast --fwd-and-reverse --align --count-labels -i {graph} -a {annotation} -p {num_theads} --discovery-fraction 0.0 --align-min-exact-match 0.0 {input}'.format( exe=METAGRAPH, @@ -482,7 +491,7 @@ def test_batch_query_with_align_both(self): ) res = subprocess.run(query_command.split(), stdout=PIPE) self.assertEqual(res.returncode, 0) - self.assertEqual(len(res.stdout), 20636) + self.assertEqual(len(res.stdout), 24777) def test_batch_query_with_tiny_batch(self): query_command = '{exe} query --fast --batch-size 100 -i {graph} -a {annotation} --discovery-fraction 1.0 {input}'.format( @@ -528,6 +537,123 @@ def test_query_coordinates(self): self.assertEqual(len(res.stdout), 687712) +@parameterized_class(('graph_repr', 'anno_repr'), + input_values=product( + [repr for repr in GRAPH_TYPES if not (repr == 'bitmap' and PROTEIN_MODE)], + ANNO_TYPES + ['row_diff_brwt_separate', + 'row_diff_brwt_no_fork_opt', + 'row_diff_brwt_no_anchor_opt'] + ) + product(['succinct_bloom', 'succinct_mask'], ['flat']), + class_name_func=get_test_class_name +) +class TestQuery1Column(TestingBase): + @classmethod + def setUpClass(cls): + cls.tempdir = TemporaryDirectory() + + cls.with_bloom = False + if cls.graph_repr == 'succinct_bloom': + cls.graph_repr = 'succinct' + cls.with_bloom = True + + cls.mask_dummy = False + if cls.graph_repr == 'succinct_mask': + cls.graph_repr = 'succinct' + cls.mask_dummy = True + + construct_command = '{exe} build {mask_dummy} -p {num_threads} \ + --graph {repr} -k 20 -o {outfile} {input}'.format( + exe=METAGRAPH, + mask_dummy='--mask-dummy' if cls.mask_dummy else '', + num_threads=NUM_THREADS, + repr=cls.graph_repr, + outfile=cls.tempdir.name + '/graph', + input=TEST_DATA_DIR + '/transcripts_100.fa' + ) + + res = subprocess.run([construct_command], shell=True) + assert(res.returncode == 0) + + stats_command = '{exe} stats {graph}'.format( + exe=METAGRAPH, + graph=cls.tempdir.name + '/graph' + graph_file_extension[cls.graph_repr], + ) 
+ res = subprocess.run(stats_command.split(), stdout=PIPE) + assert(res.returncode == 0) + params_str = res.stdout.decode().split('\n')[2:] + assert('k: 20' == params_str[0]) + if cls.graph_repr != 'succinct' or cls.mask_dummy: + assert('nodes (k): 46960' == params_str[1]) + assert('mode: basic' == params_str[2]) + + if cls.with_bloom: + convert_command = '{exe} transform -o {outfile} --initialize-bloom {bloom_param} {input}'.format( + exe=METAGRAPH, + outfile=cls.tempdir.name + '/graph', + bloom_param='--bloom-fpp 0.1', + input=cls.tempdir.name + '/graph' + graph_file_extension[cls.graph_repr], + ) + res = subprocess.run([convert_command], shell=True) + assert(res.returncode == 0) + + def check_suffix(anno_repr, suffix): + match = anno_repr.endswith(suffix) + if match: + anno_repr = anno_repr[:-len(suffix)] + return anno_repr, match + + cls.anno_repr, separate = check_suffix(cls.anno_repr, '_separate') + cls.anno_repr, no_fork_opt = check_suffix(cls.anno_repr, '_no_fork_opt') + cls.anno_repr, no_anchor_opt = check_suffix(cls.anno_repr, '_no_anchor_opt') + + cls._annotate_graph( + TEST_DATA_DIR + '/transcripts_100.fa', + cls.tempdir.name + '/graph' + graph_file_extension[cls.graph_repr], + cls.tempdir.name + '/annotation', + cls.anno_repr, + separate, + no_fork_opt, + no_anchor_opt, + anno_type='label 1' + ) + + # check annotation + anno_stats_command = '{exe} stats -a {annotation}'.format( + exe=METAGRAPH, + annotation=cls.tempdir.name + '/annotation' + anno_file_extension[cls.anno_repr], + ) + res = subprocess.run(anno_stats_command.split(), stdout=PIPE) + assert(res.returncode == 0) + params_str = res.stdout.decode().split('\n')[2:] + assert('labels: 1' == params_str[0]) + if cls.graph_repr != 'hashfast' and (cls.graph_repr != 'succinct' or cls.mask_dummy): + assert('objects: 46960' == params_str[1]) + + if cls.anno_repr.endswith('_noswap'): + cls.anno_repr = cls.anno_repr[:-len('_noswap')] + + assert('representation: ' + cls.anno_repr == params_str[3]) + + def 
test_query(self): + query_command = f'{METAGRAPH} query \ + -i {self.tempdir.name}/graph{graph_file_extension[self.graph_repr]} \ + -a {self.tempdir.name}/annotation{anno_file_extension[self.anno_repr]} \ + --discovery-fraction 1.0 \ + {TEST_DATA_DIR}/transcripts_1000.fa' + res = subprocess.run(query_command.split(), stdout=PIPE) + self.assertEqual(res.returncode, 0) + self.assertEqual(hashlib.sha224(res.stdout).hexdigest(), '254d173abb255a81a4ab8a685201a73de8dbad4546c378e0a645d454') + + query_command = f'{METAGRAPH} query --count-labels \ + -i {self.tempdir.name}/graph{graph_file_extension[self.graph_repr]} \ + -a {self.tempdir.name}/annotation{anno_file_extension[self.anno_repr]} \ + --discovery-fraction 1.0 \ + {TEST_DATA_DIR}/transcripts_1000.fa' + res = subprocess.run(query_command.split(), stdout=PIPE) + self.assertEqual(res.returncode, 0) + self.assertEqual(hashlib.sha224(res.stdout).hexdigest(), '1bd6c24373812064c3e17e73533de7b1e30baa3cca3a64b460e83cb4') + + @parameterized_class(('graph_repr', 'anno_repr'), input_values=product( [repr for repr in GRAPH_TYPES if not (repr == 'bitmap' and PROTEIN_MODE)], @@ -831,6 +957,10 @@ def setUpClass(cls): assert('labels: 100' == params_str[0]) if cls.graph_repr != 'hashfast' and (cls.graph_repr != 'succinct' or cls.mask_dummy): assert('objects: 91584' == params_str[1]) + + if cls.anno_repr.endswith('_noswap'): + cls.anno_repr = cls.anno_repr[:-len('_noswap')] + assert('representation: ' + cls.anno_repr == params_str[3]) def test_query(self): @@ -1037,6 +1167,10 @@ def setUpClass(cls): assert('labels: 100' == params_str[0]) if cls.graph_repr != 'hashfast' and (cls.graph_repr != 'succinct' or cls.mask_dummy): assert('objects: 45792' == params_str[1]) + + if cls.anno_repr.endswith('_noswap'): + cls.anno_repr = cls.anno_repr[:-len('_noswap')] + assert('representation: ' + cls.anno_repr == params_str[3]) def test_query(self): diff --git a/metagraph/src/annotation/binary_matrix/multi_brwt/brwt_builders.cpp 
b/metagraph/src/annotation/binary_matrix/multi_brwt/brwt_builders.cpp index e60f30892e..9bc78e4f2b 100644 --- a/metagraph/src/annotation/binary_matrix/multi_brwt/brwt_builders.cpp +++ b/metagraph/src/annotation/binary_matrix/multi_brwt/brwt_builders.cpp @@ -18,7 +18,7 @@ using mtg::common::logger; BRWTBottomUpBuilder::Partitioner BRWTBottomUpBuilder::get_basic_partitioner(size_t arity) { - assert(arity > 1u); + assert(arity > 0u); return [arity](const VectorPtrs &vectors) { if (!vectors.size()) @@ -200,8 +200,35 @@ BRWT BRWTBottomUpBuilder::build( size_t num_nodes_parallel, size_t num_threads) { - if (!linkage.size()) - return BRWT(); + if (!linkage.size()) { + logger->warn("Passed no linkage rules. Assembling Multi-BRWT without internal nodes..."); + + std::vector> columns; + + std::mutex mu; + uint64_t num_rows = 0; + get_columns([&](uint64_t i, std::unique_ptr&& column) { + std::unique_lock lock(mu); + + uint64_t size = column->size(); + if (!num_rows) + num_rows = size; + + if (size != num_rows) { + logger->error("Can't merge columns of different size"); + exit(1); + } + + while (i >= columns.size()) { + columns.emplace_back(); + } + assert(!columns[i]); + columns[i] = std::move(column); + }); + + return build(std::move(columns), get_basic_partitioner(columns.size()), + num_nodes_parallel, num_threads); + } std::function dump_node; std::function get_node; diff --git a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp index bc320c03d0..ab4b69b741 100644 --- a/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp +++ b/metagraph/src/annotation/binary_matrix/row_diff/row_diff.hpp @@ -173,6 +173,10 @@ RowDiff::get_rows(const std::vector &row_ids) const { VectorMap node_to_rd; node_to_rd.reserve(row_ids.size() * RD_PATH_RESERVE_SIZE); + // keeps how many times rows in |rd_rows| will be queried + std::vector times_traversed; + times_traversed.reserve(row_ids.size() * 
RD_PATH_RESERVE_SIZE); + // Truncated row-diff paths, indexes to |rd_rows|. // The last index in each path points to an anchor or to a row which had // been reached before, and thus, will be reconstructed before this one. @@ -189,10 +193,13 @@ RowDiff::get_rows(const std::vector &row_ids) const { // The annotation for that node will have been reconstructed earlier // than for other nodes in this path as well. Thus, we will start // reconstruction from that node and don't need its successors. - if (!is_new) + if (!is_new) { + times_traversed[it.value()]++; break; + } rd_ids.push_back(row); + times_traversed.push_back(1); if (anchor_[row]) break; @@ -206,6 +213,7 @@ RowDiff::get_rows(const std::vector &row_ids) const { node_to_rd = VectorMap(); std::vector rd_rows = diffs_.get_rows(rd_ids); + common::logger->trace("Queried batch of {} diffed rows", rd_ids.size()); rd_ids = std::vector(); @@ -219,9 +227,16 @@ RowDiff::get_rows(const std::vector &row_ids) const { std::sort(rd_rows[*it].begin(), rd_rows[*it].end()); add_diff(rd_rows[*it], &result); // replace diff row with full reconstructed annotation - rd_rows[*it] = result; + if (--times_traversed[*it]) { + rd_rows[*it] = result; + } else { + // free memory + rd_rows[*it] = {}; + } } } + common::logger->trace("Reconstructed annotations for {} rows", rows.size()); + assert(times_traversed == std::vector(rd_rows.size(), 0)); return rows; } diff --git a/metagraph/src/annotation/int_matrix/base/int_matrix.hpp b/metagraph/src/annotation/int_matrix/base/int_matrix.hpp index 00beb51f6e..a0a4c7d36e 100644 --- a/metagraph/src/annotation/int_matrix/base/int_matrix.hpp +++ b/metagraph/src/annotation/int_matrix/base/int_matrix.hpp @@ -27,6 +27,8 @@ class IntMatrix : public binmat::BinaryMatrix { virtual RowValues sum_row_values(const std::vector> &index_counts, size_t min_count = 1) const; + + virtual const binmat::BinaryMatrix& get_binary_matrix() const { return *this; } }; @@ -52,9 +54,6 @@ class MultiIntMatrix : public 
IntMatrix { virtual std::vector get_row_tuples(const std::vector &rows) const = 0; - - virtual bool load_tuples(std::istream &in) = 0; - virtual void serialize_tuples(std::ostream &out) const = 0; }; } // namespace matrix diff --git a/metagraph/src/annotation/int_matrix/rank_extended/tuple_csc_matrix.hpp b/metagraph/src/annotation/int_matrix/rank_extended/tuple_csc_matrix.hpp index 08c5a78b11..d2aa546312 100644 --- a/metagraph/src/annotation/int_matrix/rank_extended/tuple_csc_matrix.hpp +++ b/metagraph/src/annotation/int_matrix/rank_extended/tuple_csc_matrix.hpp @@ -28,6 +28,13 @@ class TupleCSCMatrix : public MultiIntMatrix { TupleCSCMatrix(BaseMatrix&& index_matrix) : binary_matrix_(std::move(index_matrix)) {} + TupleCSCMatrix(BaseMatrix&& index_matrix, + std::vector&& delimiters, + std::vector&& column_values) + : binary_matrix_(std::move(index_matrix)), + delimiters_(std::move(delimiters)), + column_values_(std::move(column_values)) {} + // return tuple sizes (if not zero) at each entry RowValues get_row_values(Row row) const; @@ -63,6 +70,8 @@ class TupleCSCMatrix : public MultiIntMatrix { bool load(std::istream &in); void serialize(std::ostream &out) const; + template + static void load_tuples(std::istream &in, uint64_t num_columns, const Callback &callback); bool load_tuples(std::istream &in); void serialize_tuples(std::ostream &out) const; @@ -168,20 +177,33 @@ inline bool TupleCSCMatrix::load_tuples(std::istream delimiters_.clear(); column_values_.clear(); - delimiters_.resize(num_columns()); - column_values_.resize(num_columns()); - for (size_t j = 0; j < column_values_.size(); ++j) { - try { - delimiters_[j].load(in); - column_values_[j].load(in); - } catch (...) 
{ - common::logger->error("Couldn't load tuple attributes for column {}", j); - return false; - } + delimiters_.reserve(num_columns()); + column_values_.reserve(num_columns()); + try { + load_tuples(in, num_columns(), [&](Delims&& delims, Values&& values) { + delimiters_.push_back(std::move(delims)); + column_values_.push_back(std::move(values)); + }); + } catch (...) { + common::logger->error("Couldn't load tuple attributes"); + return false; } return true; } +template +template +inline void TupleCSCMatrix +::load_tuples(std::istream &in, uint64_t num_columns, const Callback &callback) { + for (size_t j = 0; j < num_columns; ++j) { + Delims delims; + delims.load(in); + Values column_values; + column_values.load(in); + callback(std::move(delims), std::move(column_values)); + } +} + template inline void TupleCSCMatrix::serialize(std::ostream &out) const { binary_matrix_.serialize(out); diff --git a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp index 44726a8bef..b0329d4834 100644 --- a/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp +++ b/metagraph/src/annotation/int_matrix/row_diff/int_row_diff.hpp @@ -48,7 +48,7 @@ class IntRowDiff : public binmat::IRowDiff, public IntMatrix { public: using anchor_bv_type = bit_vector_small; using fork_succ_bv_type = bit_vector_small; - static_assert(std::is_convertible::value); + static_assert(std::is_convertible::value); IntRowDiff() {} diff --git a/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp new file mode 100644 index 0000000000..e324c7b45b --- /dev/null +++ b/metagraph/src/annotation/int_matrix/row_diff/tuple_row_diff.hpp @@ -0,0 +1,316 @@ +#ifndef __TUPLE_ROW_DIFF_HPP__ +#define __TUPLE_ROW_DIFF_HPP__ + +#include +#include +#include +#include +#include + +#include "common/vectors/bit_vector_adaptive.hpp" +#include "common/vector_map.hpp" +#include 
"common/vector.hpp" +#include "common/logger.hpp" +#include "common/utils/template_utils.hpp" +#include "graph/annotated_dbg.hpp" +#include "graph/representation/succinct/dbg_succinct.hpp" +#include "annotation/binary_matrix/row_diff/row_diff.hpp" +#include "annotation/int_matrix/base/int_matrix.hpp" + + +namespace mtg { +namespace annot { +namespace matrix { + +template +class TupleRowDiff : public binmat::IRowDiff, public MultiIntMatrix { + public: + using anchor_bv_type = bit_vector_small; + using fork_succ_bv_type = bit_vector_small; + static_assert(std::is_convertible::value); + static const int SHIFT = 1; // coordinates increase by 1 at each edge + + TupleRowDiff() {} + + TupleRowDiff(const graph::DBGSuccinct *graph, BaseMatrix&& diff) + : diffs_(std::move(diff)) { graph_ = graph; } + + bool get(Row i, Column j) const override; + std::vector get_column(Column j) const override; + SetBitPositions get_row(Row i) const override; + std::vector get_rows(const std::vector &rows) const override; + RowTuples get_row_tuples(Row i) const override; + std::vector get_row_tuples(const std::vector &rows) const override; + + uint64_t num_columns() const override { return diffs_.num_columns(); } + uint64_t num_relations() const override { return diffs_.num_relations(); } + uint64_t num_attributes() const override { return diffs_.num_attributes(); } + uint64_t num_rows() const override { return diffs_.num_rows(); } + + bool load(std::istream &in) override; + void serialize(std::ostream &out) const override; + + void load_fork_succ(const std::string &filename); + void load_anchor(const std::string &filename); + + const anchor_bv_type& anchor() const { return anchor_; } + const BaseMatrix& diffs() const { return diffs_; } + BaseMatrix& diffs() { return diffs_; } + + private: + static void decode_diffs(RowTuples *diffs); + static void add_diff(const RowTuples &diff, RowTuples *row); + + BaseMatrix diffs_; + anchor_bv_type anchor_; + fork_succ_bv_type fork_succ_; +}; + + 
+template +bool TupleRowDiff::get(Row i, Column j) const { + SetBitPositions set_bits = get_row(i); + auto v = std::lower_bound(set_bits.begin(), set_bits.end(), j); + return v != set_bits.end() && *v == j; +} + +template +std::vector TupleRowDiff::get_column(Column j) const { + assert(graph_ && "graph must be loaded"); + assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded"); + assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1); + + // TODO: implement a more efficient algorithm + std::vector result; + for (Row i = 0; i < num_rows(); ++i) { + if (get(i, j)) + result.push_back(i); + } + return result; +} + +template +MultiIntMatrix::SetBitPositions TupleRowDiff::get_row(Row i) const { + RowTuples row = get_row_tuples(i); + SetBitPositions result(row.size()); + for (size_t k = 0; k < row.size(); ++k) { + result[k] = row[k].first; + } + return result; +} + +template +std::vector +TupleRowDiff::get_rows(const std::vector &row_ids) const { + std::vector result; + result.reserve(row_ids.size()); + + for (auto&& row : get_row_tuples(row_ids)) { + result.emplace_back(row.size()); + for (size_t k = 0; k < row.size(); ++k) { + result.back()[k] = row[k].first; + } + row = RowTuples(); + } + + return result; +} + +template +MultiIntMatrix::RowTuples TupleRowDiff::get_row_tuples(Row row) const { + return get_row_tuples(std::vector{ row })[0]; +} + +template +std::vector +TupleRowDiff::get_row_tuples(const std::vector &row_ids) const { + assert(graph_ && "graph must be loaded"); + assert(anchor_.size() == diffs_.num_rows() && "anchors must be loaded"); + assert(!fork_succ_.size() || fork_succ_.size() == graph_->num_nodes() + 1); + + const size_t RD_PATH_RESERVE_SIZE = 2; + + // diff rows annotating nodes along the row-diff paths + std::vector rd_ids; + rd_ids.reserve(row_ids.size() * RD_PATH_RESERVE_SIZE); + + // map row index to its index in |rd_rows| + VectorMap node_to_rd; + node_to_rd.reserve(row_ids.size() * RD_PATH_RESERVE_SIZE); + 
+ // Truncated row-diff paths, indexes to |rd_rows|. + // The last index in each path points to an anchor or to a row which had + // been reached before, and thus, will be reconstructed before this one. + std::vector>> rd_paths_trunc(row_ids.size()); + + for (size_t i = 0; i < row_ids.size(); ++i) { + std::vector> &rd_path = rd_paths_trunc[i]; + + std::vector path; + Vector> queue; + queue.emplace_back(0, row_ids[i]); + + while (queue.size()) { + size_t depth = queue.back().first; + Row row = queue.back().second; + queue.pop_back(); + while (depth < path.size()) { + assert(path.size() > 1); + rd_path.emplace_back(*(path.rbegin() + 1), *path.rbegin()); + path.pop_back(); + } + auto [it, is_new] = node_to_rd.try_emplace(row, rd_ids.size()); + path.push_back(it.value()); + // If a node had been reached before, we interrupt the diff path. + // The annotation for that node will have been reconstructed earlier + // than for other nodes in this path as well. Thus, we will start + // reconstruction from that node and don't need its successors. 
+ if (!is_new) + continue; + + rd_ids.push_back(row); + + if (anchor_[row]) + continue; + + auto node = graph::AnnotatedSequenceGraph::anno_to_graph_index(row); + graph_->call_row_diff_successors(node, fork_succ_, [&](auto succ) { + queue.emplace_back(depth + 1, graph::AnnotatedSequenceGraph::graph_to_anno_index(succ)); + }); + } + + while (path.size() > 1) { + rd_path.emplace_back(*(path.rbegin() + 1), *path.rbegin()); + path.pop_back(); + } + assert(path.size()); + rd_path.emplace_back(-1, path[0]); + } + + node_to_rd = VectorMap(); + + std::vector rd_rows = diffs_.get_row_tuples(rd_ids); + for (auto &row : rd_rows) { + decode_diffs(&row); + std::sort(row.begin(), row.end()); + } + + rd_ids = std::vector(); + + // reconstruct annotation rows from row-diff + std::vector rows(row_ids.size()); + + for (size_t i = 0; i < row_ids.size(); ++i) { + const auto &rd_path = rd_paths_trunc[i]; + // propagate back and reconstruct full annotations for predecessors + for (size_t j = 0; j + 1 < rd_path.size(); ++j) { + auto [node, succ] = rd_path[j]; + // reconstruct annotation by adding the diff (full succ + diff) + add_diff(rd_rows[succ], &rd_rows[node]); + } + rows[i] = rd_rows[rd_path.back().second]; + assert(std::all_of(rows[i].begin(), rows[i].end(), + [](auto &p) { return p.second.size(); })); + } + + return rows; +} + +template +bool TupleRowDiff::load(std::istream &in) { + std::string version(4, '\0'); + in.read(version.data(), 4); + return anchor_.load(in) && fork_succ_.load(in) && diffs_.load(in); +} + +template +void TupleRowDiff::serialize(std::ostream &out) const { + out.write("v2.0", 4); + anchor_.serialize(out); + fork_succ_.serialize(out); + diffs_.serialize(out); +} + +template +void TupleRowDiff::decode_diffs(RowTuples *diffs) { + std::ignore = diffs; + // no encoding +} + +template +void TupleRowDiff::add_diff(const RowTuples &diff, RowTuples *row) { + assert(std::is_sorted(row->begin(), row->end())); + assert(std::is_sorted(diff.begin(), diff.end())); + + if 
(diff.size()) { + RowTuples result; + result.reserve(row->size() + diff.size()); + + auto it = row->begin(); + auto it2 = diff.begin(); + while (it != row->end() && it2 != diff.end()) { + if (it->first < it2->first) { + result.push_back(*it); + ++it; + } else if (it->first > it2->first) { + result.push_back(*it2); + ++it2; + } else { + if (it2->second.size()) { + result.emplace_back(it->first, Tuple{}); + std::set_symmetric_difference(it->second.begin(), it->second.end(), + it2->second.begin(), it2->second.end(), + std::back_inserter(result.back().second)); + } + ++it; + ++it2; + } + } + std::copy(it, row->end(), std::back_inserter(result)); + std::copy(it2, diff.end(), std::back_inserter(result)); + + row->swap(result); + } + + assert(std::is_sorted(row->begin(), row->end())); + for (auto &[j, tuple] : *row) { + assert(std::is_sorted(tuple.begin(), tuple.end())); + for (uint64_t &c : tuple) { + c -= SHIFT; + } + } +} + +template +void TupleRowDiff::load_anchor(const std::string &filename) { + if (!std::filesystem::exists(filename)) { + common::logger->error("Can't read anchor file: {}", filename); + std::exit(1); + } + std::ifstream in(filename, ios::binary); + if (!in.good()) { + common::logger->error("Could not open anchor file {}", filename); + std::exit(1); + } + anchor_.load(in); +} + +template +void TupleRowDiff::load_fork_succ(const std::string &filename) { + if (!std::filesystem::exists(filename)) { + common::logger->error("Can't read fork successor file: {}", filename); + std::exit(1); + } + std::ifstream in(filename, ios::binary); + if (!in.good()) { + common::logger->error("Could not open fork successor file {}", filename); + std::exit(1); + } + fork_succ_.load(in); +} + +} // namespace matrix +} // namespace annot +} // namespace mtg + +#endif // __TUPLE_ROW_DIFF_HPP__ diff --git a/metagraph/src/annotation/representation/annotation_matrix/annotation_matrix.cpp b/metagraph/src/annotation/representation/annotation_matrix/annotation_matrix.cpp index 
32f06e4909..2bc4817c09 100644 --- a/metagraph/src/annotation/representation/annotation_matrix/annotation_matrix.cpp +++ b/metagraph/src/annotation/representation/annotation_matrix/annotation_matrix.cpp @@ -215,6 +215,10 @@ template class StaticBinRelAnnotator; template class StaticBinRelAnnotator, std::string>; +template class StaticBinRelAnnotator, std::string>; + +template class StaticBinRelAnnotator>, std::string>; +template class StaticBinRelAnnotator>, std::string>; } // namespace annot } // namespace mtg diff --git a/metagraph/src/annotation/representation/annotation_matrix/static_annotators_def.hpp b/metagraph/src/annotation/representation/annotation_matrix/static_annotators_def.hpp index d1f53f31ac..cceafbea85 100644 --- a/metagraph/src/annotation/representation/annotation_matrix/static_annotators_def.hpp +++ b/metagraph/src/annotation/representation/annotation_matrix/static_annotators_def.hpp @@ -15,6 +15,7 @@ #include "annotation/binary_matrix/row_vector/unique_row_binmat.hpp" #include "annotation/int_matrix/rank_extended/csc_matrix.hpp" #include "annotation/int_matrix/row_diff/int_row_diff.hpp" +#include "annotation/int_matrix/row_diff/tuple_row_diff.hpp" #include "annotation/int_matrix/csr_matrix/csr_matrix.hpp" #include "annotation/int_matrix/rank_extended/tuple_csc_matrix.hpp" @@ -54,6 +55,12 @@ typedef StaticBinRelAnnotator IntRowAnnotator; typedef StaticBinRelAnnotator, std::string> ColumnCoordAnnotator; +typedef StaticBinRelAnnotator, std::string> MultiBRWTCoordAnnotator; + +typedef StaticBinRelAnnotator>, std::string> RowDiffCoordAnnotator; + +typedef StaticBinRelAnnotator>, std::string> RowDiffBRWTCoordAnnotator; + template <> inline const std::string RowFlatAnnotator::kExtension = ".flat.annodbg"; @@ -85,6 +92,12 @@ template <> inline const std::string IntRowAnnotator::kExtension = ".int_csr.annodbg"; template <> inline const std::string ColumnCoordAnnotator::kExtension = ".column_coord.annodbg"; +template <> +inline const std::string 
MultiBRWTCoordAnnotator::kExtension = ".brwt_coord.annodbg"; +template <> +inline const std::string RowDiffCoordAnnotator::kExtension = ".row_diff_coord.annodbg"; +template <> +inline const std::string RowDiffBRWTCoordAnnotator::kExtension = ".row_diff_brwt_coord.annodbg"; } // namespace annot } // namespace mtg diff --git a/metagraph/src/annotation/representation/base/annotation.cpp b/metagraph/src/annotation/representation/base/annotation.cpp index 95659b8ee5..9fec93e3c1 100644 --- a/metagraph/src/annotation/representation/base/annotation.cpp +++ b/metagraph/src/annotation/representation/base/annotation.cpp @@ -130,6 +130,13 @@ ::add_label_coord(Index, const VLabels &, uint64_t) { exit(1); } +template +void MultiLabelAnnotation +::add_label_coords(const std::vector> &, const VLabels &) { + logger->error("Adding relation attributes is not implemented for this annotator"); + exit(1); +} + template class MultiLabelEncoded; template class LabelEncoder; diff --git a/metagraph/src/annotation/representation/base/annotation.hpp b/metagraph/src/annotation/representation/base/annotation.hpp index 92e0f29aab..7b555c4d16 100644 --- a/metagraph/src/annotation/representation/base/annotation.hpp +++ b/metagraph/src/annotation/representation/base/annotation.hpp @@ -56,6 +56,9 @@ class MultiLabelAnnotation const std::vector &counts); // for each label and index 'i' add numeric attribute 'coord' virtual void add_label_coord(Index i, const VLabels &labels, uint64_t coord); + // for each label and index 'i' add numeric attribute 'coord' + virtual void add_label_coords(const std::vector> &coords, + const VLabels &labels); virtual bool has_label(Index i, const Label &label) const = 0; virtual bool has_labels(Index i, const VLabels &labels) const = 0; diff --git a/metagraph/src/annotation/representation/column_compressed/annotate_column_compressed.cpp b/metagraph/src/annotation/representation/column_compressed/annotate_column_compressed.cpp index 8cdf19f6ce..cfbea2725d 100644 --- 
a/metagraph/src/annotation/representation/column_compressed/annotate_column_compressed.cpp +++ b/metagraph/src/annotation/representation/column_compressed/annotate_column_compressed.cpp @@ -161,6 +161,18 @@ void ColumnCompressed