Skip to content

Commit

Permalink
Converted manifest to single list of files
Browse files Browse the repository at this point in the history
  • Loading branch information
dogversioning committed Nov 18, 2024
1 parent 2528c51 commit f551a08
Show file tree
Hide file tree
Showing 18 changed files with 188 additions and 212 deletions.
147 changes: 60 additions & 87 deletions cumulus_library/actions/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,60 +135,14 @@ def run_protected_table_builder(
)


def run_table_builder(
    config: base_utils.StudyConfig,
    manifest: study_manifest.StudyManifest,
    *,
    db_parser: databases.DatabaseParser = None,
) -> None:
    """Runs every table builder module listed in a study manifest

    Each file returned by the manifest's table builder list is loaded and
    executed, in manifest order, via _load_and_execute_builder.

    :param config: a StudyConfig object
    :param manifest: a StudyManifest object
    :keyword db_parser: an object implementing DatabaseParser for the target database
    """
    # These arguments are the same for every builder; only the filename varies.
    shared_args = {
        "config": config,
        "manifest": manifest,
        "db_parser": db_parser,
    }
    builder_files = manifest.get_table_builder_file_list()
    for builder_file in builder_files:
        _load_and_execute_builder(filename=builder_file, **shared_args)


def run_counts_builders(
    config: base_utils.StudyConfig,
    manifest: study_manifest.StudyManifest,
) -> None:
    """Runs every counts builder module listed in a study manifest

    Counts are a kind of statistic, but they get their own entry point:
    by design a count is always static for a given dataset, whereas other
    statistical methods may rely on sampling or tunable input parameters
    that may need to be kept around for later review.

    :param config: a StudyConfig object
    :param manifest: a StudyManifest object
    """
    counts_files = manifest.get_counts_builder_file_list()
    for counts_file in counts_files:
        _load_and_execute_builder(
            config=config,
            manifest=manifest,
            filename=counts_file,
        )


def run_statistics_builders(
config: base_utils.StudyConfig,
manifest: study_manifest.StudyManifest,
def _run_workflow(
config: base_utils.StudyConfig, manifest: study_manifest.StudyManifest, filename: str
) -> None:
"""Loads statistics modules from toml definitions and executes
"""Loads workflow config from toml definitions and executes workflow
:param config: a StudyConfig object
:param manifest: a StudyManifest object
"""
if len(manifest.get_statistics_file_list()) == 0:
return
existing_stats = []
if not config.stats_build:
existing_stats = (
Expand All @@ -199,40 +153,41 @@ def run_statistics_builders(
)
.fetchall()
)
for file in manifest.get_statistics_file_list():
# This open is a bit redundant with the open inside of the PSM builder,
# but we're letting it slide so that builders function similarly
# across the board
safe_timestamp = base_utils.get_tablename_safe_iso_timestamp()
toml_path = pathlib.Path(f"{manifest._study_path}/{file}")
with open(toml_path, "rb") as file:
stats_config = tomllib.load(file)
config_type = stats_config["config_type"]
target_table = stats_config.get("target_table", stats_config.get("table_prefix", ""))

if (target_table,) in existing_stats and not config.stats_build:
continue
if config_type == "psm":
# This open is a bit redundant with the open inside of the PSM builder,
# but we're letting it slide so that builders function similarly
# across the board
safe_timestamp = base_utils.get_tablename_safe_iso_timestamp()
toml_path = pathlib.Path(f"{manifest._study_path}/{filename}")
with open(toml_path, "rb") as file:
workflow_config = tomllib.load(file)
config_type = workflow_config["config_type"]
target_table = workflow_config.get("target_table", workflow_config.get("table_prefix", ""))

if (target_table,) in existing_stats and not config.stats_build:
return
match config_type:
case "psm":
builder = psm_builder.PsmBuilder(
toml_config_path=toml_path,
config=stats_config,
config=workflow_config,
data_path=manifest.data_path / f"{manifest.get_study_prefix()}/psm",
)
elif config_type == "valueset":
case "valueset":
builder = valueset_builder.ValuesetBuilder(
toml_config_path=toml_path,
config=stats_config,
config=workflow_config,
data_path=manifest.data_path / f"{manifest.get_study_prefix()}/valueset",
)
else:
case _:
raise errors.StudyManifestParsingError( # pragma: no cover
f"{toml_path} references an invalid statistics type {config_type}."
f"{toml_path} references an invalid workflow type {config_type}."
)
builder.execute_queries(
config=config,
manifest=manifest,
table_suffix=safe_timestamp,
)
builder.execute_queries(
config=config,
manifest=manifest,
table_suffix=safe_timestamp,
)
if config_type in set(item.value for item in enums.StatisticsTypes):
log_utils.log_statistics(
config=config,
manifest=manifest,
Expand All @@ -242,7 +197,7 @@ def run_statistics_builders(
)


def run_matching_table_builder(
def build_matching_files(
config: base_utils.StudyConfig,
manifest: study_manifest.StudyManifest,
*,
Expand All @@ -256,34 +211,52 @@ def run_matching_table_builder(
:keyword builder: filename of a module implementing a TableBuilder
:keyword db_parser: an object implementing DatabaseParser for the target database"""
all_generators = manifest.get_all_generators()
matches = []
for file in all_generators:
if builder and file.find(builder) == -1:
continue
_load_and_execute_builder(
config=config,
manifest=manifest,
filename=file,
db_parser=db_parser,
)
if builder and file.find(builder) != -1:
matches.append(file)
build_study(config, manifest, db_parser=db_parser, file_list=matches)


def build_study(
    config: base_utils.StudyConfig,
    manifest: study_manifest.StudyManifest,
    *,
    db_parser: databases.DatabaseParser = None,
    continue_from: str | None = None,
    file_list: list | None = None,
) -> list:
    """Builds a study by dispatching each file to the runner for its type

    Files ending in .py are executed as table builders, .toml files are run
    as workflows, and .sql files are run as raw queries. Files are processed
    in list order.

    :param config: a StudyConfig object
    :param manifest: a StudyManifest object
    :keyword db_parser: an object implementing DatabaseParser for the target database
    :keyword continue_from: Name of a file to resume table creation from
    :keyword file_list: an explicit list of files to build. If None, the list
        is read from the manifest, starting at continue_from if provided
    :raises StudyManifestParsingError: if a file has an unsupported extension
    """
    # NOTE(review): the signature is annotated as returning a list, but nothing
    # in this body returns a value - confirm whether callers/tests rely on one.
    if file_list is None:
        file_list = manifest.get_file_list(continue_from)
    for file in file_list:
        if file.endswith(".py"):
            _load_and_execute_builder(
                config=config,
                manifest=manifest,
                filename=file,
                db_parser=db_parser,
            )
        elif file.endswith(".toml"):
            _run_workflow(config=config, manifest=manifest, filename=file)
        elif file.endswith(".sql"):
            _run_raw_queries(config=config, manifest=manifest, filename=file)
        else:
            # Name the offending file so a bad manifest entry is easy to find
            raise errors.StudyManifestParsingError(
                f"'{file}' is not a supported file type. "
                "Expected a .py, .toml, or .sql file."
            )


def _run_raw_queries(
config: base_utils.StudyConfig, manifest: study_manifest.StudyManifest, filename: str
):
queries = []
for file in manifest.get_sql_file_list(continue_from):
for query in base_utils.parse_sql(base_utils.load_text(f"{manifest._study_path}/{file}")):
queries.append([query, file])
for query in base_utils.parse_sql(base_utils.load_text(f"{manifest._study_path}/{filename}")):
queries.append([query, filename])
if len(queries) == 0:
return []
for query in queries:
Expand All @@ -298,7 +271,7 @@ def build_study(
# We want to only show a progress bar if we are :not: printing SQL lines
with base_utils.get_progress_bar(disable=config.verbose) as progress:
task = progress.add_task(
f"Creating {manifest.get_study_prefix()} study in db...",
f"Building tables from {filename}...",
total=len(queries),
visible=not config.verbose,
)
Expand Down
31 changes: 22 additions & 9 deletions cumulus_library/builders/protected_table_builder.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
"""Builder for creating tables for tracking state/logging changes"""

import pathlib
import tomllib

from cumulus_library import (
BaseTableBuilder,
base_utils,
Expand Down Expand Up @@ -64,12 +67,22 @@ def prepare_queries(
TRANSACTION_COLS_TYPES,
)
)
if manifest._study_config.get("statistics_config"):
self.queries.append(
base_templates.get_ctas_empty_query(
db_schema,
statistics,
STATISTICS_COLS,
STATISTICS_COLS_TYPES,
)
)
files = manifest.get_file_list()
files = [file for file in files if file.endswith(".toml")]
if len(files) == 0:
return
stats_types = set(item.value for item in enums.StatisticsTypes)
for file in files:
toml_path = pathlib.Path(f"{manifest._study_path}/{file}")
with open(toml_path, "rb") as file:
workflow_config = tomllib.load(file)
if workflow_config["config_type"] in stats_types:
self.queries.append(
base_templates.get_ctas_empty_query(
db_schema,
statistics,
STATISTICS_COLS,
STATISTICS_COLS_TYPES,
)
)
return
12 changes: 3 additions & 9 deletions cumulus_library/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ def clean_and_build_study(
config=self.get_config(manifest),
manifest=manifest,
)
builder.run_table_builder(config=self.get_config(manifest), manifest=manifest)

else:
log_utils.log_transaction(
Expand All @@ -127,11 +126,6 @@ def clean_and_build_study(
manifest=manifest,
continue_from=continue_from,
)
builder.run_counts_builders(config=self.get_config(manifest), manifest=manifest)
builder.run_statistics_builders(
config=self.get_config(manifest),
manifest=manifest,
)
log_utils.log_transaction(
config=self.get_config(manifest),
manifest=manifest,
Expand All @@ -150,7 +144,7 @@ def clean_and_build_study(
)
raise e

def run_matching_table_builder(
def build_matching_files(
self,
target: pathlib.Path,
table_builder_name: str,
Expand All @@ -164,7 +158,7 @@ def run_matching_table_builder(
:param options: The dictionary of study-specific options
"""
manifest = study_manifest.StudyManifest(target, options=options)
builder.run_matching_table_builder(
builder.build_matching_files(
config=self.get_config(manifest),
manifest=manifest,
builder=table_builder_name,
Expand Down Expand Up @@ -330,7 +324,7 @@ def run_cli(args: dict):
elif args["action"] == "build":
for target in args["target"]:
if args["builder"]:
runner.run_matching_table_builder(
runner.build_matching_files(
study_dict[target], args["builder"], options=args["options"]
)
else:
Expand Down
6 changes: 6 additions & 0 deletions cumulus_library/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ class ProtectedTables(enum.Enum):
TRANSACTIONS = "lib_transactions"


class StatisticsTypes(enum.Enum):
    """Workflow types that produce statistics sampling artifacts

    This is a subset of the available workflow types.
    """

    # Values are compared against the config_type of a workflow definition
    PSM = "psm"

class LogStatuses(enum.Enum):
DEBUG = "debug"
ERROR = "error"
Expand Down
39 changes: 34 additions & 5 deletions cumulus_library/study_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,31 @@ def get_dedicated_schema(self) -> str | None:
options = self._study_config.get("advanced_options", {})
return options.get("dedicated_schema")

def get_file_list(self, continue_from: str | None = None) -> list[str] | None:
"""Reads the contents of the file_config array from the manifest
:returns: An array of files from the manifest, or None if not found.
"""
config = self._study_config.get("file_config", {})
files = config.get("file_names", []) or []
if not files:
files = (
self.get_table_builder_file_list()
+ self.get_sql_file_list()
+ self.get_counts_builder_file_list()
+ self.get_statistics_file_list()
)
if continue_from:
for pos, file in enumerate(files):
if continue_from.split(".", 1)[0] == file.split(".", 1)[0]:
files = files[pos:]
break
else:
raise errors.StudyManifestParsingError(f"No files matching '{continue_from}' found")
return files

# The following four functions are considered deprecated, and can be removed
# after we update studies to use the new methodology
def get_sql_file_list(self, continue_from: str | None = None) -> list[str] | None:
"""Reads the contents of the sql_config array from the manifest
Expand Down Expand Up @@ -134,6 +159,8 @@ def get_statistics_file_list(self) -> list[str] | None:
stats_config = self._study_config.get("statistics_config", {})
return stats_config.get("file_names", [])

# End of deprecated section

def get_export_table_list(self) -> list[ManifestExport] | None:
"""Reads the contents of the export_list array from the manifest
Expand Down Expand Up @@ -179,11 +206,13 @@ def get_export_table_list(self) -> list[ManifestExport] | None:

def get_all_generators(self) -> list[str]:
    """Convenience method for getting files that generate sql queries

    :returns: the python files from the manifest's file list, in manifest order
    """
    # Filter the unified file list rather than combining the deprecated
    # per-type lists
    return [file for file in self.get_file_list() if file.endswith(".py")]

def get_all_workflows(self) -> list[str]:
    """Convenience method for getting the toml config files from the file list"""
    return [name for name in self.get_file_list() if name.endswith(".toml")]

def get_prefix_with_seperator(self) -> str:
"""Convenience method for getting the appropriate prefix for tables"""
Expand Down
Loading

0 comments on commit f551a08

Please sign in to comment.