diff --git a/CHANGES b/CHANGES index 613ff8c5..e1166f48 100644 --- a/CHANGES +++ b/CHANGES @@ -23,10 +23,13 @@ Changes Enhancements -* new workflows module (PR #217) +* new workflows registry that contains each EnsembleAnalysis for which + a workflows module exists, for use with workflows base module (#229) +* new workflows base module that provides iterative workflow use for + directories that contain multiple projects (#229) +* new workflows module (#217) * new automated dihedral analysis workflow (detect dihedrals with SMARTS, - analyze with EnsembleAnalysis, and generate seaborn violinplots) - PR #217) + analyze with EnsembleAnalysis, and generate seaborn violinplots) (#217) Fixes @@ -36,7 +39,7 @@ Fixes * fix ensemble.EnsembleAnalysis.check_groups_from_common_ensemble (#212) -2021-01-03 0.8.0 +2022-01-03 0.8.0 ALescoulie, orbeckst Changes diff --git a/doc/sphinx/source/workflows.txt b/doc/sphinx/source/workflows.txt index 0c8e9e68..8d2e42ec 100644 --- a/doc/sphinx/source/workflows.txt +++ b/doc/sphinx/source/workflows.txt @@ -14,4 +14,6 @@ for use with :class:`~mdpow.analysis.dihedral.DihedralAnalysis`. .. toctree:: :maxdepth: 1 + workflows/base + workflows/registry workflows/dihedrals diff --git a/doc/sphinx/source/workflows/base.txt b/doc/sphinx/source/workflows/base.txt new file mode 100644 index 00000000..e16d813c --- /dev/null +++ b/doc/sphinx/source/workflows/base.txt @@ -0,0 +1,7 @@ +============== +Workflows Base +============== + +.. versionadded:: 0.9.0 + +.. automodule:: mdpow.workflows.base diff --git a/doc/sphinx/source/workflows/registry.txt b/doc/sphinx/source/workflows/registry.txt new file mode 100644 index 00000000..a4465e37 --- /dev/null +++ b/doc/sphinx/source/workflows/registry.txt @@ -0,0 +1,7 @@ +================== +Workflows Registry +================== + +.. versionadded:: 0.9.0 + +.. 
class TestWorkflowsBase(object):
    """Tests for :mod:`mdpow.workflows.base`.

    The fixtures assemble the ``workflows`` test state into a temporary
    directory and build the project table that the tests operate on.
    """

    @pytest.fixture(scope='function')
    def SM_tmp_dir(self, molname_workflows_directory):
        """Temporary directory holding the assembled project data."""
        dirname = molname_workflows_directory
        return dirname

    @pytest.fixture(scope='function')
    def csv_input_data(self):
        """Path of the reference ``project_paths.csv`` and its parsed content."""
        csv_path = STATES['workflows'] / 'project_paths.csv'
        csv_df = pd.read_csv(csv_path).reset_index(drop=True)
        return csv_path, csv_df

    @pytest.fixture(scope='function')
    def test_df_data(self):
        """Expected ``molecule``/``resname`` columns for the test projects."""
        test_dict = {'molecule' : ['SM25', 'SM26'],
                     'resname' : ['SM25', 'SM26']}
        test_df = pd.DataFrame(test_dict).reset_index(drop=True)
        return test_df

    @pytest.fixture(scope='function')
    def project_paths_data(self, SM_tmp_dir):
        """Project table discovered by walking the temporary directory."""
        project_paths = base.project_paths(parent_directory=SM_tmp_dir)
        return project_paths

    def test_project_paths(self, test_df_data, project_paths_data):
        """The directory scan finds exactly the expected projects."""
        test_df = test_df_data
        project_paths = project_paths_data

        # Compare sorted values rather than fixed row positions: the
        # directory walk makes no promise about the order in which sibling
        # project directories are visited.
        for column in ('molecule', 'resname'):
            assert sorted(project_paths[column]) == sorted(test_df[column])

    def test_project_paths_csv_input(self, csv_input_data):
        """Reading the reference csv reproduces its content unchanged."""
        csv_path, csv_df = csv_input_data
        project_paths = base.project_paths(csv=csv_path)

        pd.testing.assert_frame_equal(project_paths, csv_df)

    def test_automated_project_analysis(self, project_paths_data, caplog):
        """The workflow runs over every project and logs its completion."""
        project_paths = project_paths_data
        # change resname to match topology (every SAMPL7 resname is 'UNK')
        # only necessary for this dataset, not necessary for normal use
        project_paths['resname'] = 'UNK'

        base.automated_project_analysis(project_paths, solvents=('water',),
                                        ensemble_analysis='DihedralAnalysis')

        assert 'all analyses completed' in caplog.text, ('automated_dihedral_analysis '
               'did not iteratively run to completion for the provided project')

    def test_automated_project_analysis_KeyError(self, project_paths_data, caplog):
        """An unknown ``ensemble_analysis`` raises ``KeyError`` and is logged."""
        caplog.clear()
        caplog.set_level(logging.ERROR, logger='mdpow.workflows.base')

        project_paths = project_paths_data
        # change resname to match topology (every SAMPL7 resname is 'UNK')
        # only necessary for this dataset, not necessary for normal use
        project_paths['resname'] = 'UNK'

        # test error output when raised
        with pytest.raises(KeyError,
                           match="Invalid ensemble_analysis 'DarthVaderAnalysis'. "
                                 "An EnsembleAnalysis type that corresponds to an existing "
                                 "automated workflow module must be input as a kwarg. ex: "
                                 "ensemble_analysis='DihedralAnalysis'"):
            base.automated_project_analysis(project_paths, ensemble_analysis='DarthVaderAnalysis', solvents=('water',))

        # test logger error recording
        assert "'DarthVaderAnalysis' is an invalid selection" in caplog.text, ('did not catch incorrect '
               'key specification for workflows.registry that results in KeyError')
def project_paths(parent_directory=None, csv=None, csv_save_dir=None):
    """Collect the molecule name, resname, and path of each MDPOW project.

    Either walks a top directory containing MDPOW projects or reads the
    same information from a .csv file with the header
    `molecule,resname,path`. If both *parent_directory* and *csv* are
    given, *parent_directory* takes precedence.

    :keywords:

    *parent_directory*
        the path for the location of the top directory
        under which the subdirectories of MDPOW simulation
        data exist; every directory that directly contains a
        subdirectory whose name starts with ``FEP`` is recorded as a
        project, and its directory name is used as both `molecule` and
        `resname`; additionally creates a 'project_paths.csv' file
        for user manipulation of metadata and for future reference

    *csv*
        .csv file containing the molecule names, resnames,
        and paths, in that order, for the MDPOW simulation
        data to be iterated over; must contain a header of the
        form: `molecule,resname,path`

    *csv_save_dir*
        optionally provided directory to save .csv file, otherwise,
        data will be saved in current working directory

    :returns:

    *project_paths*
        :class:`pandas.DataFrame` containing MDPOW project metadata,
        sorted by `molecule`, `resname`, and `path`

    :raises:

    *ValueError*
        if neither *parent_directory* nor *csv* is provided

    .. rubric:: Example

    Typical Workflow::

        projects = project_paths(parent_directory='/foo/bar/MDPOW_projects')
        automated_project_analysis(projects, ensemble_analysis='DihedralAnalysis')

    or::

        projects = project_paths(csv='/foo/bar/MDPOW.csv')
        automated_project_analysis(projects, ensemble_analysis='DihedralAnalysis')

    """
    if parent_directory is None and csv is None:
        # Without this guard the function would fall through to the return
        # statement with nothing to return.
        raise ValueError("either 'parent_directory' or 'csv' must be provided")

    sort_columns = ['molecule', 'resname', 'path']

    if parent_directory is not None:
        fep_dirname = re.compile('FEP')

        # A project directory is any directory with an immediate
        # subdirectory whose name starts with 'FEP'.
        locations = [
            dirpath
            for dirpath, dirnames, _ in os.walk(parent_directory)
            if any(fep_dirname.match(dirname) for dirname in dirnames)
        ]

        # normpath drops a trailing separator so that basename always
        # yields the final path component, on any platform.
        resnames = [os.path.basename(os.path.normpath(loc)) for loc in locations]

        projects = pd.DataFrame(
            {
                'molecule': resnames,
                'resname': resnames,
                'path': locations
            }
        )
        # os.walk visits sibling directories in arbitrary order; sort so
        # that the result is deterministic and matches the csv branch.
        projects = projects.sort_values(by=sort_columns).reset_index(drop=True)

        if csv_save_dir is not None:
            projects.to_csv(os.path.join(csv_save_dir, 'project_paths.csv'), index=False)
            logger.info(f'project_paths saved under {csv_save_dir}')
        else:
            current_directory = os.getcwd()
            projects.to_csv('project_paths.csv', index=False)
            logger.info(f'project_paths saved under {current_directory}')

    else:
        projects = pd.read_csv(csv).sort_values(by=sort_columns).reset_index(drop=True)

    return projects
def automated_project_analysis(project_paths, ensemble_analysis, **kwargs):
    """Takes a :class:`pandas.DataFrame` created by :func:`~mdpow.workflows.base.project_paths`
    and iteratively runs the specified :class:`~mdpow.analysis.ensemble.EnsembleAnalysis`
    for each of the projects by running the associated automated workflow
    in each project directory returned by :func:`~mdpow.workflows.base.project_paths`.

    Compatibility with more automated analyses in development.

    :keywords:

    *project_paths*
        :class:`pandas.DataFrame` that provides paths to MDPOW projects;
        the columns `molecule`, `resname`, and `path` are read from each row

    *ensemble_analysis*
        name of the :class:`~mdpow.analysis.ensemble.EnsembleAnalysis`
        that corresponds to the desired automated workflow module

    *kwargs*
        keyword arguments for the supported automated workflows,
        see the :mod:`~mdpow.workflows.registry` for all available
        workflows and their call signatures

    :raises:

    *KeyError*
        if *ensemble_analysis* is not a key of
        :data:`~mdpow.workflows.registry.registry`

    *TypeError*
        if the registry entry for *ensemble_analysis* is not callable

    Exceptions raised by the workflow itself are not intercepted.

    .. rubric:: Example

    A typical workflow is the automated dihedral analysis from
    :mod:`mdpow.workflows.dihedrals`, which applies the *ensemble analysis*
    :class:`~mdpow.analysis.dihedral.DihedralAnalysis` to each project.
    The :data:`~mdpow.workflows.registry.registry` contains this automated
    workflow under the key *"DihedralAnalysis"* and so the automated execution
    for all `project_paths` (obtained via :func:`project_paths`) is performed by
    passing the specific key to :func:`automated_project_analysis`::

        project_paths = project_paths(parent_directory='/foo/bar/MDPOW_projects')
        automated_project_analysis(project_paths, ensemble_analysis='DihedralAnalysis', **kwargs)

    """
    # Resolve the workflow once, before any analysis starts. Keeping the
    # lookup out of the per-project call means that a KeyError or TypeError
    # raised inside a running workflow is reported as what it is instead of
    # being mistaken for a bad ensemble_analysis selection.
    try:
        workflow = registry.registry[ensemble_analysis]
    except KeyError as err:
        msg = (f"Invalid ensemble_analysis {err}. An EnsembleAnalysis type that corresponds "
               "to an existing automated workflow module must be input as a kwarg. "
               "ex: ensemble_analysis='DihedralAnalysis'")
        logger.error(f'{err} is an invalid selection')

        raise KeyError(msg) from err

    if not callable(workflow):
        msg = (f"Invalid ensemble_analysis {ensemble_analysis}. An EnsembleAnalysis type that "
               "corresponds to an existing automated workflow module must be input as a kwarg. "
               "ex: ensemble_analysis='DihedralAnalysis'")
        logger.error(f'workflow module for {ensemble_analysis} does not exist yet')

        raise TypeError(msg)

    for row in project_paths.itertuples():
        molname = row.molecule

        logger.info(f'starting {molname}')

        workflow(dirname=row.path, resname=row.resname, molname=molname, **kwargs)

        logger.info(f'{molname} completed')

    logger.info('all analyses completed')
    return
+ +Intended for use with :mod:`mdpow.workflows.base` to specify which +:class:`~mdpow.analysis.ensemble.EnsembleAnalysis` should run iteratively over +the provided project data directory. + +To include a new automated workflow for use with :mod:`mdpow.workflows.base`, +create a key that is the name of the corresponding +:class:`~mdpow.analysis.ensemble.EnsembleAnalysis`, with the value defined as +`<workflow module>.<workflow function>`. + +The available automated workflows (key-value pairs) are listed in the +following table :any:`Currently supported automated workflows. <workflows_registry>` + +""" \ No newline at end of file