Skip to content

Commit

Permalink
Counts generation peformance improvements (#171)
Browse files Browse the repository at this point in the history
* Counts generation peformance improvements

* regression/template test tweak

* PR feedback

* actually use the util function
  • Loading branch information
dogversioning authored Feb 2, 2024
1 parent 820e1a9 commit c8033bb
Show file tree
Hide file tree
Showing 35 changed files with 12,152 additions and 11,757 deletions.
11 changes: 6 additions & 5 deletions cumulus_library/base_table_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from typing import final

from cumulus_library.databases import DatabaseCursor
from cumulus_library.helper import get_progress_bar, query_console_output
from cumulus_library import helper


class BaseTableBuilder(ABC):
Expand Down Expand Up @@ -77,18 +77,19 @@ def execute_queries(
table_names.append(table_name)
for table_name in table_names:
cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
with get_progress_bar(disable=verbose) as progress:
with helper.get_progress_bar(disable=verbose) as progress:
task = progress.add_task(
self.display_text,
total=len(self.queries),
visible=not verbose,
)
for query in self.queries:
query_console_output(verbose, query, progress, task)
try:
cursor.execute(query)
with helper.query_console_output(verbose, query, progress, task):
cursor.execute(query)
except Exception as e: # pylint: disable=broad-exception-caught
sys.exit(e)

self.post_execution(cursor, schema, verbose, drop_table, *args, **kwargs)

def post_execution(
Expand Down Expand Up @@ -120,8 +121,8 @@ def comment_queries(self, doc_str=None):
self.queries = commented_queries

def write_queries(self, path: pathlib.Path = pathlib.Path.cwd() / "output.sql"):
path.parents[0].mkdir(parents=True, exist_ok=True)
"""writes all queries constructed by prepare_queries to disk"""
path.parents[0].mkdir(parents=True, exist_ok=True)
with open(path, "w", encoding="utf-8") as file:
for query in self.queries:
file.write(query)
Expand Down
8 changes: 2 additions & 6 deletions cumulus_library/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,12 +355,8 @@ def run_cli(args: Dict):
runner.export_study(study_dict[target], args["data_path"])

elif args["action"] == "generate-sql":
if "all" in args["target"]:
for target in study_dict.keys():
runner.generate_all_sql(study_dict[target])
else:
for target in args["target"]:
runner.generate_study_sql(study_dict[target])
for target in args["target"]:
runner.generate_study_sql(study_dict[target])
finally:
db_backend.close()

Expand Down
25 changes: 25 additions & 0 deletions cumulus_library/databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,26 @@ def __init__(self, db_file: str):
None,
duckdb.typing.TIMESTAMP,
)
self.connection.create_function(
# When trying to calculate an MD5 hash in Trino/Athena, the implementation
# expects to recieve a varbinary type, so if you're hashing a string,
# you would invoke it like `SELECT md5(to_utf8(string_col)) from table`.
#
# DuckDB's md5() function accepts a varchar instead, and does not have a
# to_utf() function or varbinary type, so we patch this with a UDF that
# just provides back the original string. As a result, these functions
# have different signatures, but for cases like this where you're
# conforming an argument to another function, it provides appropriate
# function mocking
#
# NOTE: currently we do not have a use case beyond experimentation where
# using MD5 hashes provide a benefit. Until we do, it is not required to
# support this in other DatabaseBackend implementations.
"to_utf8",
self._compat_to_utf8,
None,
duckdb.typing.VARCHAR,
)

def insert_tables(self, tables: dict[str, pyarrow.Table]) -> None:
"""Ingests all ndjson data from a folder tree (often the output folder of Cumulus ETL)"""
Expand Down Expand Up @@ -245,6 +265,11 @@ def _compat_date(
else:
raise ValueError("Unexpected date() argument:", type(value), value)

@staticmethod
def _compat_to_utf8(value: Optional[str]) -> Optional[datetime.date]:
"""See the create_function() call for to_utf8 for more background"""
return value

@staticmethod
def _compat_from_iso8601_timestamp(
value: Optional[str],
Expand Down
9 changes: 7 additions & 2 deletions cumulus_library/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
import datetime
import os
import json

from contextlib import contextmanager
from typing import List

from rich import progress


Expand Down Expand Up @@ -48,14 +51,16 @@ def list_coding(code_display: dict, system=None) -> List[dict]:
return as_list


@contextmanager
def query_console_output(
verbose: bool, query: str, progress_bar: progress.Progress, task: progress.Task
):
"""Convenience function for determining output type"""
"""Convenience context manager for handling console output"""
if verbose:
print()
print(query)
else:
yield
if not verbose:
progress_bar.advance(task)


Expand Down
12 changes: 5 additions & 7 deletions cumulus_library/studies/core/builder_medication.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
""" Module for generating core medication table"""

from cumulus_library.base_table_builder import BaseTableBuilder
from cumulus_library.helper import get_progress_bar, query_console_output
from cumulus_library.template_sql import templates
from cumulus_library.template_sql.utils import is_codeable_concept_populated
from cumulus_library import base_table_builder, helper
from cumulus_library.template_sql import templates, utils
from cumulus_library.studies.core.core_templates import core_templates


class MedicationBuilder(BaseTableBuilder):
class MedicationBuilder(base_table_builder.BaseTableBuilder):
display_text = "Creating Medication table..."

def _check_data_in_fields(self, cursor, schema: str):
Expand All @@ -27,14 +25,14 @@ def _check_data_in_fields(self, cursor, schema: str):

table = "medicationrequest"
base_col = "medicationcodeableconcept"
with get_progress_bar(transient=True) as progress:
with helper.get_progress_bar(transient=True) as progress:
task = progress.add_task(
"Detecting available medication sources...",
total=7,
)

# inline medications from FHIR medication
data_types["inline"] = is_codeable_concept_populated(
data_types["inline"] = utils.is_codeable_concept_populated(
schema, table, base_col, cursor
)
if data_types["inline"]:
Expand Down
1 change: 0 additions & 1 deletion cumulus_library/studies/core/builder_prereq_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import sqlparse

from cumulus_library import base_table_builder
from cumulus_library.helper import get_progress_bar, query_console_output


class CorePrereqTableBuilder(base_table_builder.BaseTableBuilder):
Expand Down
1 change: 1 addition & 0 deletions cumulus_library/studies/core/count_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def count_core_condition(self, duration: str = "month"):
cols = [
["category_code", "varchar", "cond_category_code"],
[f"recorded_{duration}", "date", "cond_month"],
["code_display", "varchar", "cond_code_display"],
]
return self.count_condition(table_name, from_table, cols)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
-- Its format is tied to the specific database it was run against, and it may not
-- be correct for all databases. Use the CLI's build option to derive the best SQL
-- for your dataset.

-- ###########################################################

CREATE TABLE core__condition_codable_concepts_display AS (
WITH

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
-- Its format is tied to the specific database it was run against, and it may not
-- be correct for all databases. Use the CLI's build option to derive the best SQL
-- for your dataset.

-- ###########################################################

CREATE TABLE core__documentreference_dn_type AS (
WITH

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
-- Its format is tied to the specific database it was run against, and it may not
-- be correct for all databases. Use the CLI's build option to derive the best SQL
-- for your dataset.

-- ###########################################################

CREATE TABLE core__encounter_dn_type AS (
WITH

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
-- Its format is tied to the specific database it was run against, and it may not
-- be correct for all databases. Use the CLI's build option to derive the best SQL
-- for your dataset.

-- ###########################################################

CREATE TABLE core__medication AS (
WITH

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
-- Its format is tied to the specific database it was run against, and it may not
-- be correct for all databases. Use the CLI's build option to derive the best SQL
-- for your dataset.

-- ###########################################################

CREATE TABLE core__medicationrequest_dn_category AS (
WITH

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
-- Its format is tied to the specific database it was run against, and it may not
-- be correct for all databases. Use the CLI's build option to derive the best SQL
-- for your dataset.

-- ###########################################################

CREATE TABLE core__observation_dn_category AS (
WITH

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
-- Its format is tied to the specific database it was run against, and it may not
-- be correct for all databases. Use the CLI's build option to derive the best SQL
-- for your dataset.

-- ###########################################################

CREATE TABLE core__patient_ext_race AS (
WITH

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
-- Its format is tied to the specific database it was run against, and it may not
-- be correct for all databases. Use the CLI's build option to derive the best SQL
-- for your dataset.

-- ###########################################################

CREATE TABLE core__meta_version AS
SELECT 3 AS data_package_version;

Expand Down
Loading

0 comments on commit c8033bb

Please sign in to comment.