Converted manifest to single list of files

smart-on-fhir · Nov 18, 2024 · f551a08 · f551a08
1 parent 2528c51
commit f551a08
Show file tree

Hide file tree

Showing 18 changed files with 188 additions and 212 deletions.
diff --git a/cumulus_library/actions/builder.py b/cumulus_library/actions/builder.py
@@ -135,60 +135,14 @@ def run_protected_table_builder(
     )
 
 
-def run_table_builder(
-    config: base_utils.StudyConfig,
-    manifest: study_manifest.StudyManifest,
-    *,
-    db_parser: databases.DatabaseParser = None,
-) -> None:
-    """Loads modules from a manifest and executes code via BaseTableBuilder
-
-    :param config: a StudyConfig object
-    :param manifest: a StudyManifest object
-    :keyword db_parser: an object implementing DatabaseParser for the target database
-    """
-    for file in manifest.get_table_builder_file_list():
-        _load_and_execute_builder(
-            config=config,
-            manifest=manifest,
-            filename=file,
-            db_parser=db_parser,
-        )
-
-
-def run_counts_builders(
-    config: base_utils.StudyConfig,
-    manifest: study_manifest.StudyManifest,
-) -> None:
-    """Loads counts modules from a manifest and executes code via BaseTableBuilder
-
-    While a count is a form of statistics, it is treated separately from other
-    statistics because it is, by design, always going to be static against a
-    given dataset, where other statistical methods may use sampling techniques
-    or adjustable input parameters that may need to be preserved for later review.
-
-    :param config: a StudyConfig object
-    :param manifest: a StudyManifest object
-    """
-    for file in manifest.get_counts_builder_file_list():
-        _load_and_execute_builder(
-            config=config,
-            manifest=manifest,
-            filename=file,
-        )
-
-
-def run_statistics_builders(
-    config: base_utils.StudyConfig,
-    manifest: study_manifest.StudyManifest,
+def _run_workflow(
+    config: base_utils.StudyConfig, manifest: study_manifest.StudyManifest, filename: str
 ) -> None:
-    """Loads statistics modules from toml definitions and executes
+    """Loads workflow config from toml definitions and executes workflow
 
     :param config: a StudyConfig object
     :param manifest: a StudyManifest object
     """
-    if len(manifest.get_statistics_file_list()) == 0:
-        return
     existing_stats = []
     if not config.stats_build:
         existing_stats = (
@@ -199,40 +153,41 @@ def run_statistics_builders(
             )
             .fetchall()
         )
-    for file in manifest.get_statistics_file_list():
-        # This open is a bit redundant with the open inside of the PSM builder,
-        # but we're letting it slide so that builders function similarly
-        # across the board
-        safe_timestamp = base_utils.get_tablename_safe_iso_timestamp()
-        toml_path = pathlib.Path(f"{manifest._study_path}/{file}")
-        with open(toml_path, "rb") as file:
-            stats_config = tomllib.load(file)
-            config_type = stats_config["config_type"]
-            target_table = stats_config.get("target_table", stats_config.get("table_prefix", ""))
-
-        if (target_table,) in existing_stats and not config.stats_build:
-            continue
-        if config_type == "psm":
+    # This open is a bit redundant with the open inside of the PSM builder,
+    # but we're letting it slide so that builders function similarly
+    # across the board
+    safe_timestamp = base_utils.get_tablename_safe_iso_timestamp()
+    toml_path = pathlib.Path(f"{manifest._study_path}/{filename}")
+    with open(toml_path, "rb") as file:
+        workflow_config = tomllib.load(file)
+        config_type = workflow_config["config_type"]
+        target_table = workflow_config.get("target_table", workflow_config.get("table_prefix", ""))
+
+    if (target_table,) in existing_stats and not config.stats_build:
+        return
+    match config_type:
+        case "psm":
             builder = psm_builder.PsmBuilder(
                 toml_config_path=toml_path,
-                config=stats_config,
+                config=workflow_config,
                 data_path=manifest.data_path / f"{manifest.get_study_prefix()}/psm",
             )
-        elif config_type == "valueset":
+        case "valueset":
             builder = valueset_builder.ValuesetBuilder(
                 toml_config_path=toml_path,
-                config=stats_config,
+                config=workflow_config,
                 data_path=manifest.data_path / f"{manifest.get_study_prefix()}/valueset",
             )
-        else:
+        case _:
             raise errors.StudyManifestParsingError(  # pragma: no cover
-                f"{toml_path} references an invalid statistics type {config_type}."
+                f"{toml_path} references an invalid workflow type {config_type}."
             )
-        builder.execute_queries(
-            config=config,
-            manifest=manifest,
-            table_suffix=safe_timestamp,
-        )
+    builder.execute_queries(
+        config=config,
+        manifest=manifest,
+        table_suffix=safe_timestamp,
+    )
+    if config_type in set(item.value for item in enums.StatisticsTypes):
         log_utils.log_statistics(
             config=config,
             manifest=manifest,
@@ -242,7 +197,7 @@ def run_statistics_builders(
         )
 
 
-def run_matching_table_builder(
+def build_matching_files(
     config: base_utils.StudyConfig,
     manifest: study_manifest.StudyManifest,
     *,
@@ -256,34 +211,52 @@ def run_matching_table_builder(
     :keyword builder: filename of a module implementing a TableBuilder
     :keyword db_parser: an object implementing DatabaseParser for the target database"""
     all_generators = manifest.get_all_generators()
+    matches = []
     for file in all_generators:
-        if builder and file.find(builder) == -1:
-            continue
-        _load_and_execute_builder(
-            config=config,
-            manifest=manifest,
-            filename=file,
-            db_parser=db_parser,
-        )
+        if builder and file.find(builder) != -1:
+            matches.append(file)
+    build_study(config, manifest, db_parser=db_parser, file_list=matches)
 
 
 def build_study(
     config: base_utils.StudyConfig,
     manifest: study_manifest.StudyManifest,
     *,
+    db_parser: databases.DatabaseParser = None,
     continue_from: str | None = None,
+    file_list: list | None = None,
 ) -> list:
     """Creates tables in the schema by iterating through the sql_config.file_names
 
     :param config: a StudyConfig object
     :param manifest: a StudyManifest object
-    :keyword continue_from: Name of a sql file to resume table creation from
+    :keyword continue_from: Name of a file to resume table creation from
     :returns: loaded queries (for unit testing only)
     """
+    if file_list is None:
+        file_list = manifest.get_file_list(continue_from)
+    for file in file_list:
+        if file.endswith(".py"):
+            _load_and_execute_builder(
+                config=config,
+                manifest=manifest,
+                filename=file,
+                db_parser=db_parser,
+            )
+        elif file.endswith(".toml"):
+            _run_workflow(config=config, manifest=manifest, filename=file)
+        elif file.endswith(".sql"):
+            _run_raw_queries(config=config, manifest=manifest, filename=file)
+        else:
+            raise errors.StudyManifestParsingError(f"Unsupported file type: '{file}'")
+
+
+def _run_raw_queries(
+    config: base_utils.StudyConfig, manifest: study_manifest.StudyManifest, filename: str
+):
     queries = []
-    for file in manifest.get_sql_file_list(continue_from):
-        for query in base_utils.parse_sql(base_utils.load_text(f"{manifest._study_path}/{file}")):
-            queries.append([query, file])
+    for query in base_utils.parse_sql(base_utils.load_text(f"{manifest._study_path}/{filename}")):
+        queries.append([query, filename])
     if len(queries) == 0:
         return []
     for query in queries:
@@ -298,7 +271,7 @@ def build_study(
     # We want to only show a progress bar if we are :not: printing SQL lines
     with base_utils.get_progress_bar(disable=config.verbose) as progress:
         task = progress.add_task(
-            f"Creating {manifest.get_study_prefix()} study in db...",
+            f"Building tables from {filename}...",
             total=len(queries),
             visible=not config.verbose,
         )

diff --git a/cumulus_library/builders/protected_table_builder.py b/cumulus_library/builders/protected_table_builder.py
@@ -1,5 +1,8 @@
 """Builder for creating tables for tracking state/logging changes"""
 
+import pathlib
+import tomllib
+
 from cumulus_library import (
     BaseTableBuilder,
     base_utils,
@@ -64,12 +67,22 @@ def prepare_queries(
                 TRANSACTION_COLS_TYPES,
             )
         )
-        if manifest._study_config.get("statistics_config"):
-            self.queries.append(
-                base_templates.get_ctas_empty_query(
-                    db_schema,
-                    statistics,
-                    STATISTICS_COLS,
-                    STATISTICS_COLS_TYPES,
-                )
-            )
+        files = manifest.get_file_list()
+        files = [file for file in files if file.endswith(".toml")]
+        if len(files) == 0:
+            return
+        stats_types = set(item.value for item in enums.StatisticsTypes)
+        for file in files:
+            toml_path = pathlib.Path(f"{manifest._study_path}/{file}")
+            with open(toml_path, "rb") as file:
+                workflow_config = tomllib.load(file)
+                if workflow_config["config_type"] in stats_types:
+                    self.queries.append(
+                        base_templates.get_ctas_empty_query(
+                            db_schema,
+                            statistics,
+                            STATISTICS_COLS,
+                            STATISTICS_COLS_TYPES,
+                        )
+                    )
+                    return
diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py
@@ -113,7 +113,6 @@ def clean_and_build_study(
                     config=self.get_config(manifest),
                     manifest=manifest,
                 )
-                builder.run_table_builder(config=self.get_config(manifest), manifest=manifest)
 
             else:
                 log_utils.log_transaction(
@@ -127,11 +126,6 @@ def clean_and_build_study(
                 manifest=manifest,
                 continue_from=continue_from,
             )
-            builder.run_counts_builders(config=self.get_config(manifest), manifest=manifest)
-            builder.run_statistics_builders(
-                config=self.get_config(manifest),
-                manifest=manifest,
-            )
             log_utils.log_transaction(
                 config=self.get_config(manifest),
                 manifest=manifest,
@@ -150,7 +144,7 @@ def clean_and_build_study(
             )
             raise e
 
-    def run_matching_table_builder(
+    def build_matching_files(
         self,
         target: pathlib.Path,
         table_builder_name: str,
@@ -164,7 +158,7 @@ def run_matching_table_builder(
         :param options: The dictionary of study-specific options
         """
         manifest = study_manifest.StudyManifest(target, options=options)
-        builder.run_matching_table_builder(
+        builder.build_matching_files(
             config=self.get_config(manifest),
             manifest=manifest,
             builder=table_builder_name,
@@ -330,7 +324,7 @@ def run_cli(args: dict):
             elif args["action"] == "build":
                 for target in args["target"]:
                     if args["builder"]:
-                        runner.run_matching_table_builder(
+                        runner.build_matching_files(
                             study_dict[target], args["builder"], options=args["options"]
                         )
                     else:

diff --git a/cumulus_library/enums.py b/cumulus_library/enums.py
@@ -18,6 +18,12 @@ class ProtectedTables(enum.Enum):
     TRANSACTIONS = "lib_transactions"
 
 
+class StatisticsTypes(enum.Enum):
+    """The subset of workflow config_type values that create statistics sampling artifacts"""
+
+    PSM = "psm"
+
+
 class LogStatuses(enum.Enum):
     DEBUG = "debug"
     ERROR = "error"

diff --git a/cumulus_library/study_manifest.py b/cumulus_library/study_manifest.py
@@ -91,6 +91,31 @@ def get_dedicated_schema(self) -> str | None:
         options = self._study_config.get("advanced_options", {})
         return options.get("dedicated_schema")
 
+    def get_file_list(self, continue_from: str | None = None) -> list[str] | None:
+        """Reads the file list from file_config, or the deprecated sections if it is empty
+
+        :param continue_from: if set, start from the first file matching it up to the first '.'
+        :returns: a list of file names; raises StudyManifestParsingError if continue_from matches nothing"""
+        config = self._study_config.get("file_config", {})
+        files = config.get("file_names", []) or []
+        if not files:
+            files = (
+                self.get_table_builder_file_list()
+                + self.get_sql_file_list()
+                + self.get_counts_builder_file_list()
+                + self.get_statistics_file_list()
+            )
+        if continue_from:
+            for pos, file in enumerate(files):
+                if continue_from.split(".", 1)[0] == file.split(".", 1)[0]:
+                    files = files[pos:]
+                    break
+            else:
+                raise errors.StudyManifestParsingError(f"No files matching '{continue_from}' found")
+        return files
+
+    # The following four functions are considered deprecated, and can be removed
+    # after we update studies to use the new methodology
     def get_sql_file_list(self, continue_from: str | None = None) -> list[str] | None:
         """Reads the contents of the sql_config array from the manifest
 
@@ -134,6 +159,8 @@ def get_statistics_file_list(self) -> list[str] | None:
         stats_config = self._study_config.get("statistics_config", {})
         return stats_config.get("file_names", [])
 
+    # End of deprecated section
+
     def get_export_table_list(self) -> list[ManifestExport] | None:
         """Reads the contents of the export_list array from the manifest
 
@@ -179,11 +206,13 @@ def get_export_table_list(self) -> list[ManifestExport] | None:
 
     def get_all_generators(self) -> list[str]:
         """Convenience method for getting files that generate sql queries"""
-        return (
-            self.get_table_builder_file_list()
-            + self.get_counts_builder_file_list()
-            + self.get_statistics_file_list()
-        )
+        files = self.get_file_list()
+        return [file for file in files if file.endswith(".py")]
+
+    def get_all_workflows(self) -> list[str]:
+        """Convenience method for getting workflow config files, i.e. the .toml entries"""
+        files = self.get_file_list()
+        return [file for file in files if file.endswith(".toml")]
 
     def get_prefix_with_seperator(self) -> str:
         """Convenience method for getting the appropriate prefix for tables"""