From 811f514dc98989a6db58f11c91a13e9100d70e93 Mon Sep 17 00:00:00 2001 From: Matt Garber Date: Mon, 5 Feb 2024 13:32:32 -0500 Subject: [PATCH] Uprev to 3.11, namespace clarification --- .github/workflows/ci.yaml | 12 ++--- cumulus_library/base_table_builder.py | 8 ++-- cumulus_library/{helper.py => base_utils.py} | 0 cumulus_library/cli.py | 8 ++-- cumulus_library/parsers/fhir_valueset.py | 8 ++-- cumulus_library/protected_table_builder.py | 17 +++---- cumulus_library/statistics/psm.py | 40 ++++++++--------- .../studies/core/builder_condition.py | 10 ++--- .../studies/core/builder_documentreference.py | 6 +-- .../studies/core/builder_encounter.py | 7 ++- .../studies/core/builder_medication.py | 24 +++++----- .../studies/core/builder_medicationrequest.py | 6 +-- .../studies/core/builder_observation.py | 37 ++++++++++------ .../studies/core/builder_patient.py | 6 +-- .../core/core_templates/core_templates.py | 4 +- .../studies/vocab/vocab_icd_builder.py | 8 ++-- cumulus_library/study_parser.py | 33 +++++++------- .../{templates.py => base_templates.py} | 8 ++-- .../template_sql/{utils.py => sql_utils.py} | 22 +++++----- cumulus_library/upload.py | 4 +- pyproject.toml | 2 +- ...te_utils.py => test_template_sql_utils.py} | 8 ++-- tests/test_templates.py | 44 +++++++++---------- 23 files changed, 167 insertions(+), 155 deletions(-) rename cumulus_library/{helper.py => base_utils.py} (100%) rename cumulus_library/template_sql/{templates.py => base_templates.py} (97%) rename cumulus_library/template_sql/{utils.py => sql_utils.py} (93%) rename tests/{test_template_utils.py => test_template_sql_utils.py} (88%) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index ffad1854..3c6619c6 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -18,7 +18,7 @@ jobs: name: unit tests runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v4 @@ -35,7 +35,7 @@ jobs: lint: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install linters run: | python -m pip install --upgrade pip @@ -52,19 +52,19 @@ jobs: permissions: id-token: write # This is required for requesting the JWT steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install dependencies run: | python -m pip install --upgrade pip pip install . - # See https://github.com/aws-actions/configure-aws-credentials for configuring + # See https://github.com/Moulick/configure-multiple-aws-roles for configuring # the aws side of this - the below action is just a light wrapper - name: Configure AWS Credentials - uses: mcblair/configure-aws-profile-action@v0.1.1 + uses: Moulick/configure-multiple-aws-roles@v4 with: role-arn: arn:aws:iam::${{secrets.AWS_ACCOUNT}}:role/cumulus-library-ci region: us-east-1 - profile-name: cumulus-library-ci + profile: cumulus-library-ci - name: Rebuild tables env: PROFILE: cumulus-library-ci diff --git a/cumulus_library/base_table_builder.py b/cumulus_library/base_table_builder.py index bf6a8783..66646500 100644 --- a/cumulus_library/base_table_builder.py +++ b/cumulus_library/base_table_builder.py @@ -8,7 +8,7 @@ from typing import final from cumulus_library.databases import DatabaseCursor -from cumulus_library import helper +from cumulus_library import base_utils class BaseTableBuilder(ABC): @@ -77,7 +77,7 @@ def execute_queries( table_names.append(table_name) for table_name in table_names: cursor.execute(f"DROP TABLE IF EXISTS {table_name}") - with helper.get_progress_bar(disable=verbose) as progress: + with base_utils.get_progress_bar(disable=verbose) as progress: task = progress.add_task( self.display_text, total=len(self.queries), @@ -85,7 +85,9 @@ def execute_queries( ) for query in self.queries: try: - with helper.query_console_output(verbose, query, progress, task): + with base_utils.query_console_output( + verbose, query, progress, task + ): cursor.execute(query) except Exception as e: # pylint: disable=broad-exception-caught sys.exit(e) diff --git a/cumulus_library/helper.py b/cumulus_library/base_utils.py similarity index 100% rename from cumulus_library/helper.py rename to cumulus_library/base_utils.py diff --git a/cumulus_library/cli.py b/cumulus_library/cli.py index 6dd58afa..c5465649 100755 --- a/cumulus_library/cli.py +++ b/cumulus_library/cli.py @@ -18,12 +18,12 @@ databases, enums, errors, - helper, + base_utils, protected_table_builder, study_parser, upload, ) -from cumulus_library.template_sql import templates +from cumulus_library.template_sql import base_templates class StudyRunner: @@ -41,7 +41,7 @@ def __init__(self, db: databases.DatabaseBackend, data_path: str): def update_transactions(self, prefix: str, status: str): """Adds a record to a study's transactions table""" self.cursor.execute( - templates.get_insert_into_query( + base_templates.get_insert_into_query( f"{prefix}__{enums.ProtectedTables.TRANSACTIONS.value}", protected_table_builder.TRANSACTIONS_COLS, [ @@ -49,7 +49,7 @@ def update_transactions(self, prefix: str, status: str): prefix, __version__, status, - helper.get_utc_datetime(), + base_utils.get_utc_datetime(), ] ], {"event_time": "TIMESTAMP"}, diff --git a/cumulus_library/parsers/fhir_valueset.py b/cumulus_library/parsers/fhir_valueset.py index fb0e7dd6..a11eaccf 100644 --- a/cumulus_library/parsers/fhir_valueset.py +++ b/cumulus_library/parsers/fhir_valueset.py @@ -3,8 +3,8 @@ from fhirclient.models.coding import Coding -from cumulus_library.helper import load_json -from cumulus_library.template_sql.templates import get_create_view_query +from cumulus_library import base_utils +from cumulus_library.template_sql import base_templates def get_include_coding(valueset_json) -> List[Coding]: @@ -20,7 +20,7 @@ def get_include_coding(valueset_json) -> List[Coding]: :param valueset_json: ValueSet file, expecially those provided by NLM/ONC/VSAC :return: list of codeable concepts (system, code, display) to include """ - valueset = load_json(valueset_json) + valueset = base_utils.load_json(valueset_json) parsed = [] for include in valueset["compose"]["include"]: @@ -42,7 +42,7 @@ def create_view_sql(view_name: str, concept_list: List[Coding]) -> str: content = [] for concept in concept_list: content.append([concept.system, concept.code, concept.display]) - return get_create_view_query( + return base_templates.get_create_view_query( view_name=view_name, dataset=content, view_cols=["system", "code", "display"] ) diff --git a/cumulus_library/protected_table_builder.py b/cumulus_library/protected_table_builder.py index f7d35787..8d0a2cfe 100644 --- a/cumulus_library/protected_table_builder.py +++ b/cumulus_library/protected_table_builder.py @@ -1,10 +1,7 @@ """ Builder for creating tables for tracking state/logging changes""" -from cumulus_library.base_table_builder import BaseTableBuilder -from cumulus_library.enums import ProtectedTables -from cumulus_library.template_sql.templates import ( - get_ctas_empty_query, -) +from cumulus_library import base_table_builder, enums +from cumulus_library.template_sql import base_templates TRANSACTIONS_COLS = ["study_name", "library_version", "status", "event_time"] TRANSACTION_COLS_TYPES = ["varchar", "varchar", "varchar", "timestamp"] @@ -28,7 +25,7 @@ ] -class ProtectedTableBuilder(BaseTableBuilder): +class ProtectedTableBuilder(base_table_builder.BaseTableBuilder): """Builder for tables that persist across study clean/build actions""" display_text = "Creating/updating system tables..." @@ -43,18 +40,18 @@ def prepare_queries( **kwargs, ): self.queries.append( - get_ctas_empty_query( + base_templates.get_ctas_empty_query( schema, - f"{study_name}__{ProtectedTables.TRANSACTIONS.value}", + f"{study_name}__{enums.ProtectedTables.TRANSACTIONS.value}", TRANSACTIONS_COLS, TRANSACTION_COLS_TYPES, ) ) if study_stats: self.queries.append( - get_ctas_empty_query( + base_templates.get_ctas_empty_query( schema, - f"{study_name}__{ProtectedTables.STATISTICS.value}", + f"{study_name}__{enums.ProtectedTables.STATISTICS.value}", STATISTICS_COLS, STATISTICS_COLS_TYPES, ) diff --git a/cumulus_library/statistics/psm.py b/cumulus_library/statistics/psm.py index 3475f9c8..00a2f059 100644 --- a/cumulus_library/statistics/psm.py +++ b/cumulus_library/statistics/psm.py @@ -2,10 +2,10 @@ import json import os +import pathlib import sys import warnings -from pathlib import PosixPath from dataclasses import dataclass import pandas @@ -18,14 +18,10 @@ import matplotlib.pyplot as plt import seaborn as sns -from cumulus_library.databases import DatabaseCursor -from cumulus_library.base_table_builder import BaseTableBuilder -from cumulus_library.template_sql import templates - -from cumulus_library.template_sql.statistics.psm_templates import ( - get_distinct_ids, - get_create_covariate_table, -) +from cumulus_library import databases +from cumulus_library import base_table_builder +from cumulus_library.template_sql import base_templates +from cumulus_library.template_sql.statistics import psm_templates @dataclass @@ -56,12 +52,12 @@ class PsmConfig: seed: int -class PsmBuilder(BaseTableBuilder): +class PsmBuilder(base_table_builder.BaseTableBuilder): """TableBuilder for creating PSM tables""" display_text = "Building PSM tables..." - def __init__(self, toml_config_path: str, data_path: PosixPath): + def __init__(self, toml_config_path: str, data_path: pathlib.Path): """Loads PSM job details from a PSM configuration file""" super().__init__() # We're stashing the toml path for error reporting later @@ -75,7 +71,7 @@ def __init__(self, toml_config_path: str, data_path: PosixPath): sys.exit(f"PSM configuration not found at {self.toml_path}") try: self.config = PsmConfig( - classification_json=f"{PosixPath(self.toml_path).parent}/{toml_config['classification_json']}", + classification_json=f"{pathlib.Path(self.toml_path).parent}/{toml_config['classification_json']}", pos_source_table=toml_config["pos_source_table"], neg_source_table=toml_config["neg_source_table"], target_table=toml_config["target_table"], @@ -103,7 +99,7 @@ def _get_symptoms_dict(self, path: str) -> dict: def _get_sampled_ids( self, - cursor: DatabaseCursor, + cursor: databases.DatabaseCursor, schema: str, query: str, sample_size: int, @@ -135,12 +131,14 @@ def _get_sampled_ids( return df def _create_covariate_table( - self, cursor: DatabaseCursor, schema: str, table_suffix: str + self, cursor: databases.DatabaseCursor, schema: str, table_suffix: str ): """Creates a covariate table from the loaded toml config""" # checks for primary & link ref being the same source_refs = list({self.config.primary_ref, self.config.count_ref} - {None}) - pos_query = get_distinct_ids(source_refs, self.config.pos_source_table) + pos_query = psm_templates.get_distinct_ids( + source_refs, self.config.pos_source_table + ) pos = self._get_sampled_ids( cursor, schema, @@ -149,7 +147,7 @@ def _create_covariate_table( self.config.dependent_variable, 1, ) - neg_query = get_distinct_ids( + neg_query = psm_templates.get_distinct_ids( source_refs, self.config.neg_source_table, join_id=self.config.primary_ref, @@ -165,14 +163,14 @@ def _create_covariate_table( ) cohort = pandas.concat([pos, neg]) - ctas_query = templates.get_ctas_query_from_df( + ctas_query = base_templates.get_ctas_query_from_df( schema, f"{self.config.pos_source_table}_sampled_ids_{table_suffix}", cohort, ) self.queries.append(ctas_query) - dataset_query = get_create_covariate_table( + dataset_query = psm_templates.get_create_covariate_table( target_table=f"{self.config.target_table}_{table_suffix}", pos_source_table=self.config.pos_source_table, neg_source_table=self.config.neg_source_table, @@ -261,15 +259,15 @@ def psm_effect_size_plot( sns_plot.figure.savefig(filename, dpi=250, bbox_inches="tight") def generate_psm_analysis( - self, cursor: DatabaseCursor, schema: str, table_suffix: str + self, cursor: databases.DatabaseCursor, schema: str, table_suffix: str ): stats_table = f"{self.config.target_table}_{table_suffix}" """Runs PSM statistics on generated tables""" cursor.execute( - templates.get_alias_table_query(stats_table, self.config.target_table) + base_templates.get_alias_table_query(stats_table, self.config.target_table) ) df = cursor.execute( - templates.get_select_all_query(self.config.target_table) + base_templates.get_select_all_query(self.config.target_table) ).as_pandas() symptoms_dict = self._get_symptoms_dict(self.config.classification_json) for dependent_variable, codes in symptoms_dict.items(): diff --git a/cumulus_library/studies/core/builder_condition.py b/cumulus_library/studies/core/builder_condition.py index c0448262..afdf28c5 100644 --- a/cumulus_library/studies/core/builder_condition.py +++ b/cumulus_library/studies/core/builder_condition.py @@ -1,7 +1,7 @@ from cumulus_library import base_table_builder from cumulus_library import databases from cumulus_library.studies.core.core_templates import core_templates -from cumulus_library.template_sql import templates, utils +from cumulus_library.template_sql import base_templates, sql_utils expected_table_cols = { @@ -45,7 +45,7 @@ class CoreConditionBuilder(base_table_builder.BaseTableBuilder): display_text = "Creating Condition tables..." def denormalize_codes(self): - preferred_config = utils.CodeableConceptConfig( + preferred_config = sql_utils.CodeableConceptConfig( source_table="condition", source_id="id", column_name="code", @@ -59,10 +59,10 @@ def denormalize_codes(self): ], ) self.queries.append( - templates.get_codeable_concept_denormalize_query(preferred_config) + base_templates.get_codeable_concept_denormalize_query(preferred_config) ) - all_config = utils.CodeableConceptConfig( + all_config = sql_utils.CodeableConceptConfig( source_table="condition", source_id="id", column_name="code", @@ -71,7 +71,7 @@ def denormalize_codes(self): filter_priority=False, ) self.queries.append( - templates.get_codeable_concept_denormalize_query(all_config) + base_templates.get_codeable_concept_denormalize_query(all_config) ) def prepare_queries( diff --git a/cumulus_library/studies/core/builder_documentreference.py b/cumulus_library/studies/core/builder_documentreference.py index e31d933e..22d047d6 100644 --- a/cumulus_library/studies/core/builder_documentreference.py +++ b/cumulus_library/studies/core/builder_documentreference.py @@ -1,7 +1,7 @@ from cumulus_library import base_table_builder from cumulus_library import databases from cumulus_library.studies.core.core_templates import core_templates -from cumulus_library.template_sql import templates, utils +from cumulus_library.template_sql import sql_utils expected_table_cols = { @@ -28,11 +28,11 @@ def prepare_queries( parser: databases.DatabaseParser = None, **kwargs, ): - self.queries += utils.denormalize_codes( + self.queries += sql_utils.denormalize_codes( schema, cursor, [ - utils.CodeableConceptConfig( + sql_utils.CodeableConceptConfig( source_table="documentreference", source_id="id", column_name="type", diff --git a/cumulus_library/studies/core/builder_encounter.py b/cumulus_library/studies/core/builder_encounter.py index ec6c3a7a..6f5595c7 100644 --- a/cumulus_library/studies/core/builder_encounter.py +++ b/cumulus_library/studies/core/builder_encounter.py @@ -1,8 +1,7 @@ from cumulus_library import base_table_builder from cumulus_library import databases from cumulus_library.studies.core.core_templates import core_templates -from cumulus_library.template_sql import templates -from cumulus_library.template_sql import utils +from cumulus_library.template_sql import sql_utils expected_table_cols = { @@ -81,7 +80,7 @@ def denormalize_codes(self, schema, cursor): code_configs = [] for code_source in code_sources: code_configs.append( - utils.CodeableConceptConfig( + sql_utils.CodeableConceptConfig( source_table="encounter", source_id="id", column_name=code_source["column_name"], @@ -91,7 +90,7 @@ def denormalize_codes(self, schema, cursor): target_table=f"core__encounter_dn_{code_source['column_name']}", ) ) - self.queries += utils.denormalize_codes(schema, cursor, code_configs) + self.queries += sql_utils.denormalize_codes(schema, cursor, code_configs) def prepare_queries( self, diff --git a/cumulus_library/studies/core/builder_medication.py b/cumulus_library/studies/core/builder_medication.py index 27f16117..f61a5643 100644 --- a/cumulus_library/studies/core/builder_medication.py +++ b/cumulus_library/studies/core/builder_medication.py @@ -1,7 +1,7 @@ """ Module for generating core medication table""" -from cumulus_library import base_table_builder, helper -from cumulus_library.template_sql import templates, utils +from cumulus_library import base_table_builder, base_utils +from cumulus_library.template_sql import base_templates, sql_utils from cumulus_library.studies.core.core_templates import core_templates @@ -25,18 +25,20 @@ def _check_data_in_fields(self, cursor, schema: str): table = "medicationrequest" base_col = "medicationcodeableconcept" - with helper.get_progress_bar(transient=True) as progress: + with base_utils.get_progress_bar(transient=True) as progress: task = progress.add_task( "Detecting available medication sources...", total=7, ) # inline medications from FHIR medication - data_types["inline"] = utils.is_codeable_concept_populated( + data_types["inline"] = sql_utils.is_codeable_concept_populated( schema, table, base_col, cursor ) if data_types["inline"]: - query = templates.get_column_datatype_query(schema, table, [base_col]) + query = base_templates.get_column_datatype_query( + schema, table, [base_col] + ) cursor.execute(query) progress.advance(task) if "userselected" not in str(cursor.fetchone()[0]): @@ -46,21 +48,21 @@ def _check_data_in_fields(self, cursor, schema: str): else: has_userselected = False # Validating presence of FHIR medication requests - query = templates.get_is_table_not_empty_query( + query = base_templates.get_is_table_not_empty_query( "medicationrequest", "medicationreference" ) cursor.execute(query) progress.advance(task) if cursor.fetchone() is None: return data_types, has_userselected - query = templates.get_column_datatype_query( + query = base_templates.get_column_datatype_query( schema, "medicationrequest", ["medicationreference"] ) cursor.execute(query) progress.advance(task) if "reference" not in cursor.fetchone()[0]: return data_types, has_userselected - query = templates.get_is_table_not_empty_query( + query = base_templates.get_is_table_not_empty_query( "medicationrequest", "medicationreference.reference" ) cursor.execute(query) @@ -69,7 +71,7 @@ def _check_data_in_fields(self, cursor, schema: str): return data_types, has_userselected # checking med ref contents for our two linkage cases - query = templates.get_is_table_not_empty_query( + query = base_templates.get_is_table_not_empty_query( "medicationrequest", "medicationreference.reference", conditions=["medicationreference.reference LIKE '#%'"], @@ -78,7 +80,7 @@ def _check_data_in_fields(self, cursor, schema: str): progress.advance(task) if cursor.fetchone() is not None: data_types["by_contained_ref"] = True - query = templates.get_is_table_not_empty_query( + query = base_templates.get_is_table_not_empty_query( "medicationrequest", "medicationreference.reference", conditions=["medicationreference.reference LIKE 'Medication/%'"], @@ -116,7 +118,7 @@ def prepare_queries(self, cursor: object, schema: str, *args, **kwargs) -> dict: ) else: self.queries.append( - templates.get_ctas_empty_query( + base_templates.get_ctas_empty_query( schema, "core__medication", [ diff --git a/cumulus_library/studies/core/builder_medicationrequest.py b/cumulus_library/studies/core/builder_medicationrequest.py index 6afcffa4..3030ea7e 100644 --- a/cumulus_library/studies/core/builder_medicationrequest.py +++ b/cumulus_library/studies/core/builder_medicationrequest.py @@ -4,7 +4,7 @@ as it leverages the core__medication table for data population""" from cumulus_library import base_table_builder -from cumulus_library.template_sql import templates, utils +from cumulus_library.template_sql import base_templates, sql_utils from cumulus_library import databases from cumulus_library.studies.core.core_templates import core_templates @@ -24,7 +24,7 @@ class MedicationRequestBuilder(base_table_builder.BaseTableBuilder): display_text = "Creating MedicationRequest tables..." def denormalize_codes(self): - preferred_config = utils.CodeableConceptConfig( + preferred_config = sql_utils.CodeableConceptConfig( source_table="medicationrequest", source_id="id", column_name="category", @@ -33,7 +33,7 @@ def denormalize_codes(self): filter_priority=False, ) self.queries.append( - templates.get_codeable_concept_denormalize_query(preferred_config) + base_templates.get_codeable_concept_denormalize_query(preferred_config) ) def prepare_queries( diff --git a/cumulus_library/studies/core/builder_observation.py b/cumulus_library/studies/core/builder_observation.py index 2ec5bc39..294ba3c4 100644 --- a/cumulus_library/studies/core/builder_observation.py +++ b/cumulus_library/studies/core/builder_observation.py @@ -1,12 +1,12 @@ """ Module for extracting US core extensions from patient records""" +from dataclasses import dataclass + from cumulus_library import base_table_builder -from cumulus_library.template_sql import templates, utils +from cumulus_library.template_sql import sql_utils from cumulus_library import databases from cumulus_library.studies.core.core_templates import core_templates -CCC = utils.CodeableConceptConfig - expected_table_cols = { "observation": { "id": [], @@ -32,16 +32,12 @@ } -# TODO: upgrade to 3.10+, use kw_only flag to subclass a dataclass for generating source/target -code_sources = [ - CCC(column_name="category", is_array=True, filter_priority=False), - CCC(column_name="code", is_array=False, filter_priority=False), - CCC(column_name="interpretation", is_array=True, filter_priority=False), - CCC(column_name="valuecodeableconcept", is_array=False, filter_priority=False), -] -for source in code_sources: - source.source_table = "observation" - source.target_table = f"core__observation_dn_{source.column_name}" +@dataclass(kw_only=True) +class ObsConfig(sql_utils.CodeableConceptConfig): + source_table: str = "observation" + + def __post_init__(self): + self.target_table = f"core__observation_dn_{self.column_name}" class ObservationBuilder(base_table_builder.BaseTableBuilder): @@ -60,7 +56,20 @@ def prepare_queries( :param cursor: A database cursor object :param schema: the schema/db name, matching the cursor """ - self.queries += utils.denormalize_codes(schema, cursor, code_sources) + code_sources = [ + ObsConfig(column_name="category", is_array=True, filter_priority=False), + ObsConfig(column_name="code", is_array=False, filter_priority=False), + ObsConfig( + column_name="interpretation", is_array=True, filter_priority=False + ), + ObsConfig( + column_name="valuecodeableconcept", + is_array=False, + filter_priority=False, + ), + ] + + self.queries += sql_utils.denormalize_codes(schema, cursor, code_sources) validated_schema = core_templates.validate_schema( cursor, schema, expected_table_cols, parser ) diff --git a/cumulus_library/studies/core/builder_patient.py b/cumulus_library/studies/core/builder_patient.py index 75e40f93..8302163e 100644 --- a/cumulus_library/studies/core/builder_patient.py +++ b/cumulus_library/studies/core/builder_patient.py @@ -1,7 +1,7 @@ """ Module for extracting US core extensions from patient records""" from cumulus_library.base_table_builder import BaseTableBuilder -from cumulus_library.template_sql import templates, utils +from cumulus_library.template_sql import base_templates, sql_utils from cumulus_library import databases from cumulus_library.studies.core.core_templates import core_templates @@ -44,7 +44,7 @@ def prepare_queries( ] for extension in extension_types: - config = utils.ExtensionConfig( + config = sql_utils.ExtensionConfig( "patient", "id", f"core__patient_ext_{extension['name']}", @@ -53,7 +53,7 @@ def prepare_queries( ["ombCategory", "detailed", "text"], is_array=True, ) - self.queries.append(templates.get_extension_denormalize_query(config)) + self.queries.append(base_templates.get_extension_denormalize_query(config)) validated_schema = core_templates.validate_schema( cursor, schema, expected_table_cols, parser ) diff --git a/cumulus_library/studies/core/core_templates/core_templates.py b/cumulus_library/studies/core/core_templates/core_templates.py index 584faee9..ce4808c3 100644 --- a/cumulus_library/studies/core/core_templates/core_templates.py +++ b/cumulus_library/studies/core/core_templates/core_templates.py @@ -3,7 +3,7 @@ import jinja2 -from cumulus_library.template_sql import templates +from cumulus_library.template_sql import base_templates PATH = pathlib.Path(__file__).parent @@ -22,7 +22,7 @@ def get_core_template( def validate_schema(cursor: object, schema: str, expected_table_cols, parser): validated_schema = {} for table, cols in expected_table_cols.items(): - query = templates.get_column_datatype_query(schema, table, cols.keys()) + query = base_templates.get_column_datatype_query(schema, table, cols.keys()) table_schema = cursor.execute(query).fetchall() validated_schema[table] = parser.validate_table_schema(cols, table_schema) return validated_schema diff --git a/cumulus_library/studies/vocab/vocab_icd_builder.py b/cumulus_library/studies/vocab/vocab_icd_builder.py index 27c503af..cb9f849f 100644 --- a/cumulus_library/studies/vocab/vocab_icd_builder.py +++ b/cumulus_library/studies/vocab/vocab_icd_builder.py @@ -4,7 +4,7 @@ import pathlib from cumulus_library import base_table_builder -from cumulus_library.template_sql import templates +from cumulus_library.template_sql import base_templates class VocabIcdRunner(base_table_builder.BaseTableBuilder): @@ -49,7 +49,7 @@ def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): if not created: row = self.clean_row(next(reader), filename) self.queries.append( - templates.get_ctas_query( + base_templates.get_ctas_query( schema_name=schema, table_name=table_name, dataset=[row], @@ -63,7 +63,7 @@ def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): rows_processed += 1 if rows_processed == self.partition_size: self.queries.append( - templates.get_insert_into_query( + base_templates.get_insert_into_query( table_name=table_name, table_cols=headers, dataset=dataset, @@ -73,7 +73,7 @@ def prepare_queries(self, cursor: object, schema: str, *args, **kwargs): rows_processed = 0 if rows_processed > 0: self.queries.append( - templates.get_insert_into_query( + base_templates.get_insert_into_query( table_name=table_name, table_cols=headers, dataset=dataset ) ) diff --git a/cumulus_library/study_parser.py b/cumulus_library/study_parser.py index 17891a4d..71a6de00 100644 --- a/cumulus_library/study_parser.py +++ b/cumulus_library/study_parser.py @@ -12,8 +12,9 @@ from rich.progress import Progress, TaskID, track -from cumulus_library import __version__, helper from cumulus_library import ( + __version__, + base_utils, base_table_builder, databases, enums, @@ -21,7 +22,7 @@ protected_table_builder, ) from cumulus_library.statistics import psm -from cumulus_library.template_sql import templates +from cumulus_library.template_sql import base_templates StrList = List[str] @@ -234,8 +235,8 @@ def clean_study( if confirm is None or confirm.lower() not in ("y", "yes"): sys.exit("Table cleaning aborted") - view_sql = templates.get_show_views(schema_name, drop_prefix) - table_sql = templates.get_show_tables(schema_name, drop_prefix) + view_sql = base_templates.get_show_views(schema_name, drop_prefix) + table_sql = base_templates.get_show_tables(schema_name, drop_prefix) for query_and_type in [[view_sql, "VIEW"], [table_sql, "TABLE"]]: view_table_list = self.get_unprotected_stats_view_table( cursor, @@ -268,7 +269,7 @@ def clean_study( if confirm.lower() not in ("y", "yes"): sys.exit("Table cleaning aborted") # We want to only show a progress bar if we are :not: printing SQL lines - with helper.get_progress_bar(disable=verbose) as progress: + with base_utils.get_progress_bar(disable=verbose) as progress: task = progress.add_task( f"Removing {display_prefix} study artifacts...", total=len(view_table_list), @@ -284,7 +285,7 @@ def clean_study( # if we're doing a stats clean, we'll also remove the table containing the # list of protected tables if stats_clean: - drop_query = templates.get_drop_view_table( + drop_query = base_templates.get_drop_view_table( f"{drop_prefix}{enums.ProtectedTables.STATISTICS.value}", "TABLE" ) cursor.execute(drop_query) @@ -308,10 +309,12 @@ def _execute_drop_queries( :param task: a TaskID for a given progress bar """ for view_table in view_table_list: - drop_view_table = templates.get_drop_view_table( + drop_view_table = base_templates.get_drop_view_table( name=view_table[0], view_or_table=view_table[1] ) - with helper.query_console_output(verbose, drop_view_table, progress, task): + with base_utils.query_console_output( + verbose, drop_view_table, progress, task + ): cursor.execute(drop_view_table) def _load_and_execute_builder( @@ -462,7 +465,7 @@ def run_statistics_builders( # This open is a bit redundant with the open inside of the PSM builder, # but we're letting it slide so that builders function similarly # across the board - safe_timestamp = helper.get_tablename_safe_iso_timestamp() + safe_timestamp = base_utils.get_tablename_safe_iso_timestamp() toml_path = pathlib.Path(f"{self._study_path}/{file}") with open(toml_path, encoding="UTF-8") as file: config = toml.load(file) @@ -480,7 +483,7 @@ def run_statistics_builders( cursor, schema, verbose, table_suffix=safe_timestamp ) - insert_query = templates.get_insert_into_query( + insert_query = base_templates.get_insert_into_query( f"{self.get_study_prefix()}__{enums.ProtectedTables.STATISTICS.value}", [ "study_name", @@ -497,7 +500,7 @@ def run_statistics_builders( config_type, f"{target_table}_{safe_timestamp}", target_table, - helper.get_utc_datetime(), + base_utils.get_utc_datetime(), ] ], ) @@ -568,14 +571,14 @@ def build_study( """ queries = [] for file in self.get_sql_file_list(continue_from): - for query in helper.parse_sql( - helper.load_text(f"{self._study_path}/{file}") + for query in base_utils.parse_sql( + base_utils.load_text(f"{self._study_path}/{file}") ): queries.append([query, file]) if len(queries) == 0: return [] # We want to only show a progress bar if we are :not: printing SQL lines - with helper.get_progress_bar(disable=verbose) as progress: + with base_utils.get_progress_bar(disable=verbose) as progress: task = progress.add_task( f"Creating {self.get_study_prefix()} study in db...", total=len(queries), @@ -652,7 +655,7 @@ def _execute_build_queries( "start with a string like `study_prefix__`.", ) try: - with helper.query_console_output(verbose, query[0], progress, task): + with base_utils.query_console_output(verbose, query[0], progress, task): cursor.execute(query[0]) except Exception as e: # pylint: disable=broad-exception-caught self._query_error( diff --git a/cumulus_library/template_sql/templates.py b/cumulus_library/template_sql/base_templates.py similarity index 97% rename from cumulus_library/template_sql/templates.py rename to cumulus_library/template_sql/base_templates.py index acc8bf28..cff0f4ac 100644 --- a/cumulus_library/template_sql/templates.py +++ b/cumulus_library/template_sql/base_templates.py @@ -8,7 +8,7 @@ from pandas import DataFrame from cumulus_library import databases -from cumulus_library.template_sql import utils +from cumulus_library.template_sql import sql_utils PATH = Path(__file__).parent @@ -39,7 +39,9 @@ def get_code_system_pairs(output_table_name: str, code_system_tables: list) -> s ) -def get_codeable_concept_denormalize_query(config: utils.CodeableConceptConfig) -> str: +def get_codeable_concept_denormalize_query( + config: sql_utils.CodeableConceptConfig, +) -> str: """extracts codeable concepts from a specified table. This function is targeted at arbitrary codeableConcept elements - see @@ -172,7 +174,7 @@ def get_drop_view_table(name: str, view_or_table: str) -> str: ) -def get_extension_denormalize_query(config: utils.ExtensionConfig) -> str: +def get_extension_denormalize_query(config: sql_utils.ExtensionConfig) -> str: """extracts target extension from a table into a denormalized table This function is targeted at a complex extension element that is at the root diff --git a/cumulus_library/template_sql/utils.py b/cumulus_library/template_sql/sql_utils.py similarity index 93% rename from cumulus_library/template_sql/utils.py rename to cumulus_library/template_sql/sql_utils.py index aa5cd7b5..2664be56 100644 --- a/cumulus_library/template_sql/utils.py +++ b/cumulus_library/template_sql/sql_utils.py @@ -14,12 +14,12 @@ import duckdb from typing import List -from cumulus_library import helper -from cumulus_library.template_sql import templates +from cumulus_library import base_utils +from cumulus_library.template_sql import base_templates from cumulus_library import databases -@dataclass +@dataclass(kw_only=True) class CodeableConceptConfig: """Holds parameters for generating codableconcept tables. @@ -89,7 +89,7 @@ def _check_data_in_fields( """ - with helper.get_progress_bar(transient=True) as progress: + with base_utils.get_progress_bar(transient=True) as progress: task = progress.add_task( "Detecting available encounter codeableConcepts...", # Each column in code_sources requires at most 3 queries to @@ -119,11 +119,11 @@ def denormalize_codes( for code_source in code_sources: if code_source.has_data: queries.append( - templates.get_codeable_concept_denormalize_query(code_source) + base_templates.get_codeable_concept_denormalize_query(code_source) ) else: queries.append( - templates.get_ctas_empty_query( + base_templates.get_ctas_empty_query( schema_name=schema, table_name=code_source.target_table, table_cols=["id", "code", "code_system", "display"], @@ -157,7 +157,7 @@ def is_codeable_concept_populated( if not _check_schema_if_exists(schema, table, base_col, cursor, coding_element): return False - query = templates.get_is_table_not_empty_query( + query = base_templates.get_is_table_not_empty_query( table, "t1.row1", [ @@ -200,7 +200,7 @@ def is_codeable_concept_array_populated( try: if not _check_schema_if_exists(schema, table, base_col, cursor, coding_element): return False - query = templates.get_is_table_not_empty_query( + query = base_templates.get_is_table_not_empty_query( table, "t2.row2", [ @@ -247,7 +247,7 @@ def is_code_populated( schema, table, base_col, cursor, "coding", check_missing=True ): return False - query = templates.get_is_table_not_empty_query( + query = base_templates.get_is_table_not_empty_query( table, base_col, ) @@ -267,12 +267,12 @@ def _check_schema_if_exists( ) -> bool: """Validation check for a column existing, and having the expected schema""" try: - query = templates.get_is_table_not_empty_query(table, base_col) + query = base_templates.get_is_table_not_empty_query(table, base_col) cursor.execute(query) if cursor.fetchone() is None: return False - query = templates.get_column_datatype_query(schema, table, [base_col]) + query = base_templates.get_column_datatype_query(schema, table, [base_col]) cursor.execute(query) schema_str = str(cursor.fetchone()[1]) if check_missing: diff --git a/cumulus_library/upload.py b/cumulus_library/upload.py index 08a117be..21edbdd0 100644 --- a/cumulus_library/upload.py +++ b/cumulus_library/upload.py @@ -9,7 +9,7 @@ from pandas import read_parquet from rich.progress import Progress, TaskID -from cumulus_library.helper import get_progress_bar +from cumulus_library import base_utils def upload_data( @@ -91,7 +91,7 @@ def upload_files(args: dict): except StopIteration: version = "0" num_uploads = len(file_paths) - with get_progress_bar() as progress: + with base_utils.get_progress_bar() as progress: file_upload_progress = progress.add_task("Uploading", total=num_uploads) for file_path in file_paths: upload_data(progress, file_upload_progress, file_path, version, args) diff --git a/pyproject.toml b/pyproject.toml index 716ccd55..805041d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "cumulus-library" -requires-python = ">= 3.9" +requires-python = ">= 3.11" dependencies = [ "ctakesclient >= 1.3", "cumulus-fhir-support >= 1", diff --git a/tests/test_template_utils.py b/tests/test_template_sql_utils.py similarity index 88% rename from tests/test_template_utils.py rename to tests/test_template_sql_utils.py index 8312640b..0c44614f 100644 --- a/tests/test_template_utils.py +++ b/tests/test_template_sql_utils.py @@ -4,7 +4,7 @@ import pytest from contextlib import nullcontext as does_not_raise -from cumulus_library.template_sql import utils +from cumulus_library.template_sql import sql_utils @pytest.mark.parametrize( @@ -22,7 +22,7 @@ ) def test_is_codeable_concept_populated(mock_db, table, base_col, expected, raises): with raises: - res = utils.is_codeable_concept_populated( + res = sql_utils.is_codeable_concept_populated( "main", table, base_col, mock_db.cursor() ) assert res == expected @@ -45,7 +45,7 @@ def test_is_codeable_concept_array_populated( mock_db, table, base_col, expected, raises ): with raises: - res = utils.is_codeable_concept_array_populated( + res = sql_utils.is_codeable_concept_array_populated( "main", table, base_col, mock_db.cursor() ) assert res == expected @@ -66,5 +66,5 @@ def test_is_codeable_concept_array_populated( ) def test_is_code_populated(mock_db, table, base_col, expected, raises): with raises: - res = utils.is_code_populated("main", table, base_col, mock_db.cursor()) + res = sql_utils.is_code_populated("main", table, base_col, mock_db.cursor()) assert res == expected diff --git a/tests/test_templates.py b/tests/test_templates.py index 53904c07..6da24bcf 100644 --- a/tests/test_templates.py +++ b/tests/test_templates.py @@ -4,19 +4,19 @@ from pandas import DataFrame -from cumulus_library.template_sql import templates, utils +from cumulus_library.template_sql import base_templates, sql_utils def test_alias_table(): expected = """CREATE OR REPLACE VIEW target AS SELECT * FROM source;""" - query = templates.get_alias_table_query("source", "target") + query = base_templates.get_alias_table_query("source", "target") assert query == expected def test_select_all(): expected = """SELECT * FROM source;""" - query = templates.get_select_all_query("source") + query = base_templates.get_select_all_query("source") assert query == expected @@ -52,14 +52,14 @@ def test_codeable_concept_denormalize_all_creation(): FROM union_table ); """ - config = utils.CodeableConceptConfig( + config = sql_utils.CodeableConceptConfig( source_table="source", source_id="id", column_name="code_col", target_table="target__concepts", is_array=True, ) - query = templates.get_codeable_concept_denormalize_query(config) + query = base_templates.get_codeable_concept_denormalize_query(config) assert query == expected @@ -140,7 +140,7 @@ def test_codeable_concept_denormalize_filter_creation(): ); """ - config = utils.CodeableConceptConfig( + config = sql_utils.CodeableConceptConfig( source_table="source", source_id="id", column_name="code_col", @@ -152,7 +152,7 @@ def test_codeable_concept_denormalize_filter_creation(): "http://hl7.org/fhir/sid/icd-10-cm", ], ) - query = templates.get_codeable_concept_denormalize_query(config) + query = base_templates.get_codeable_concept_denormalize_query(config) assert query == expected @@ -167,7 +167,7 @@ def test_get_column_datatype_query(): AND table_name = 'table_name' AND LOWER(column_name) IN ('foo', 'bar') --noqa: LT05""" - query = templates.get_column_datatype_query( + query = base_templates.get_column_datatype_query( schema_name="schema_name", table_name="table_name", column_names=["foo", "bar"], @@ -185,7 +185,7 @@ def test_create_view_query_creation(): AS t ("a","b") );""" - query = templates.get_create_view_query( + query = base_templates.get_create_view_query( view_name="test_view", dataset=[["foo", "foo"], ["bar", "bar"]], view_cols=["a", "b"], @@ -227,7 +227,7 @@ def test_create_view_query_creation(): ], ) def test_ctas_empty_query_creation(expected, schema, table, cols, types): - query = templates.get_ctas_empty_query( + query = base_templates.get_ctas_empty_query( schema_name=schema, table_name=table, table_cols=cols, table_cols_types=types ) assert query == expected @@ -242,14 +242,14 @@ def test_ctas_query_creation(): ) AS t ("a","b") );""" - query = templates.get_ctas_query( + query = base_templates.get_ctas_query( schema_name="test_schema", table_name="test_table", dataset=[["foo", "foo"], ["bar", "bar"]], table_cols=["a", "b"], ) assert query == expected - query = templates.get_ctas_query_from_df( + query = base_templates.get_ctas_query_from_df( schema_name="test_schema", table_name="test_table", df=DataFrame({"a": ["foo", "bar"], "b": ["foo", "bar"]}), @@ -337,7 +337,7 @@ def test_extension_denormalize_creation(): ) WHERE available_priority = 1 );""" - config = utils.ExtensionConfig( + config = sql_utils.ExtensionConfig( "source_table", "source_id", "target_table", @@ -345,9 +345,9 @@ def test_extension_denormalize_creation(): "fhir_extension", ["omb", "text"], ) - query = templates.get_extension_denormalize_query(config) + query = base_templates.get_extension_denormalize_query(config) assert query == expected - config = utils.ExtensionConfig( + config = sql_utils.ExtensionConfig( "source_table", "source_id", "target_table", @@ -356,7 +356,7 @@ def test_extension_denormalize_creation(): ["omb", "text"], is_array=True, ) - query = templates.get_extension_denormalize_query(config) + query = base_templates.get_extension_denormalize_query(config) array_sql = """LOWER( ARRAY_JOIN( ARRAY_SORT( @@ -385,7 +385,7 @@ def test_insert_into_query_creation(): VALUES ('foo','foo'), ('bar','bar');""" - query = templates.get_insert_into_query( + query = base_templates.get_insert_into_query( table_name="test_table", table_cols=["a", "b"], dataset=[["foo", "foo"], ["bar", "bar"]], @@ -396,7 +396,7 @@ def test_insert_into_query_creation(): VALUES ('foo',VARCHAR 'foo'), ('bar',VARCHAR 'bar');""" - query = templates.get_insert_into_query( + query = base_templates.get_insert_into_query( table_name="test_table", table_cols=["a", "b"], dataset=[["foo", "foo"], ["bar", "bar"]], @@ -413,7 +413,7 @@ def test_is_table_not_empty(): WHERE field_name IS NOT NULL LIMIT 1;""" - query = templates.get_is_table_not_empty_query( + query = base_templates.get_is_table_not_empty_query( source_table="table_name", field="field_name" ) assert query == expected @@ -427,7 +427,7 @@ def test_is_table_not_empty(): WHERE field_name IS NOT NULL LIMIT 1;""" - query = templates.get_is_table_not_empty_query( + query = base_templates.get_is_table_not_empty_query( source_table="table_name", field="field_name", unnests=[ @@ -447,7 +447,7 @@ def test_is_table_not_empty(): AND field_name IS NOT NULL --noqa: LT02 LIMIT 1;""" - query = templates.get_is_table_not_empty_query( + query = base_templates.get_is_table_not_empty_query( source_table="table_name", field="field_name", conditions=["field_name LIKE 's%'", "field_name IS NOT NULL"], @@ -497,7 +497,7 @@ def test_get_code_system_pairs(): ) ) AS t (table_name, column_name, code, display, system)""" - query = templates.get_code_system_pairs( + query = base_templates.get_code_system_pairs( "output_table", [ {