From 60209ac6e29710e2c3daee4bbd38f4d352bb7853 Mon Sep 17 00:00:00 2001 From: matt garber Date: Tue, 21 Jan 2025 12:56:40 -0500 Subject: [PATCH] Better duckdb schema, manifest/template entrypoint tweaks (#342) * Better duckdb schema, manifest/template entrypoint tweaks * PR feedback * cleanup of countsbuilder tests --- cumulus_library/__init__.py | 13 +++++++- cumulus_library/actions/builder.py | 6 +--- .../builders/base_table_builder.py | 2 +- cumulus_library/builders/counts.py | 30 ++++++++++++------- .../statistics_templates/counts_templates.py | 2 +- .../template_sql/base_templates.py | 9 +++++- .../template_sql/ctas_from_parquet.sql.jinja | 2 +- .../template_sql/insert_into.sql.jinja | 2 +- tests/test_actions.py | 2 +- tests/test_base_templates.py | 6 ++-- tests/test_counts_builder.py | 15 ++-------- .../study_python_counts_valid/module1.py | 3 -- .../study_python_counts_valid/module2.py | 3 -- 13 files changed, 51 insertions(+), 44 deletions(-) diff --git a/cumulus_library/__init__.py b/cumulus_library/__init__.py index 7f7ca9a9..542bf51b 100644 --- a/cumulus_library/__init__.py +++ b/cumulus_library/__init__.py @@ -4,6 +4,17 @@ from cumulus_library.builders.base_table_builder import BaseTableBuilder from cumulus_library.builders.counts import CountsBuilder from cumulus_library.study_manifest import StudyManifest +from cumulus_library.template_sql.base_templates import get_template -__all__ = ["BaseTableBuilder", "CountsBuilder", "StudyConfig", "StudyManifest"] +# A note about the `get_template` function: +# As of this writing, the get_template function is used internally by the +# Cumulus team across multiple study repos for creating SQL in databases, both +# from the templates in this repo and in the various other projects (while leveraging +# some of our syntax helper macros, which are auto loaded by this function). +# Breaking changes to these templates will result in a major version bump. + +# This API should be usable for your own study efforts - but the documentation +# is all code level. See template_sql for more information. + +__all__ = ["BaseTableBuilder", "CountsBuilder", "StudyConfig", "StudyManifest", "get_template"] __version__ = "4.2.0" diff --git a/cumulus_library/actions/builder.py b/cumulus_library/actions/builder.py index 39d874bc..bc6e17f4 100644 --- a/cumulus_library/actions/builder.py +++ b/cumulus_library/actions/builder.py @@ -13,7 +13,6 @@ from cumulus_library import ( BaseTableBuilder, - CountsBuilder, base_utils, databases, enums, @@ -97,10 +96,7 @@ def _load_and_execute_builder( # remove it so it doesn't interfere with the next python module to # execute, since the subclass would otherwise hang around. table_builder_class = table_builder_subclasses[0] - if issubclass(table_builder_class, CountsBuilder): - table_builder = table_builder_class(study_prefix=manifest.get_study_prefix()) - else: - table_builder = table_builder_class() + table_builder = table_builder_class(manifest=manifest) if write_reference_sql: prefix = manifest.get_study_prefix() table_builder.prepare_queries(config=config, manifest=manifest, parser=db_parser) diff --git a/cumulus_library/builders/base_table_builder.py b/cumulus_library/builders/base_table_builder.py index 5f15ac5a..ee943938 100644 --- a/cumulus_library/builders/base_table_builder.py +++ b/cumulus_library/builders/base_table_builder.py @@ -18,7 +18,7 @@ class BaseTableBuilder(abc.ABC): display_text = "Building custom tables..." - def __init__(self): + def __init__(self, manifest: study_manifest.StudyManifest | None = None): self.queries = [] @abc.abstractmethod diff --git a/cumulus_library/builders/counts.py b/cumulus_library/builders/counts.py index 0ab617d9..82a538ac 100644 --- a/cumulus_library/builders/counts.py +++ b/cumulus_library/builders/counts.py @@ -2,6 +2,8 @@ from pathlib import Path +from rich.console import Console + from cumulus_library import BaseTableBuilder, errors, study_manifest from cumulus_library.builders.statistics_templates import counts_templates @@ -12,9 +14,23 @@ class CountsBuilder(BaseTableBuilder): """Extends BaseTableBuilder for counts-related use cases""" - def __init__(self, study_prefix: str | None = None): + def __init__( + self, study_prefix: str | None = None, manifest: study_manifest.StudyManifest | None = None + ): super().__init__() - self.study_prefix = study_prefix + if manifest: + self.study_prefix = manifest.get_study_prefix() + elif study_prefix: + console = Console() + console.print( + "[yellow]Warning: providing study_prefix to a CountsBuilder is deprecated" + " and will be removed in a future version" + ) + self.study_prefix = study_prefix + else: + raise errors.CountsBuilderError( + "CountsBuilder should be initiated with a valid manifest.toml" + ) def get_table_name(self, table_name: str, duration=None) -> str: """Convenience method for constructing table name @@ -22,11 +38,6 @@ def get_table_name(self, table_name: str, duration=None) -> str: :param table_name: table name to add after the study prefix :param duration: a time period reflecting the table binning strategy """ - if not self.study_prefix: - raise errors.CountsBuilderError( - "CountsBuilder must be either initiated with a study prefix, " - "or be in a directory with a valid manifest.toml" - ) if duration: return f"{self.study_prefix}__{table_name}_{duration}" else: @@ -64,7 +75,7 @@ def get_count_query( :keyword min_subject: An integer setting the minimum bin size for inclusion (default: 10) :keyword fhir_resource: The type of FHIR resource to count (see - statistics/statistics_templates/count_templates.CountableFhirResource) + builders/statistics_templates/count_templates.CountableFhirResource) """ if not table_name or not source_table or not table_cols: raise errors.CountsBuilderError( @@ -345,5 +356,4 @@ def prepare_queries( This should be overridden in any count generator. See studies/core/count_core.py for an example """ - if manifest and not self.study_prefix: - self.study_prefix = manifest.get_study_prefix() + pass diff --git a/cumulus_library/builders/statistics_templates/counts_templates.py b/cumulus_library/builders/statistics_templates/counts_templates.py index 1751c520..3926eda3 100644 --- a/cumulus_library/builders/statistics_templates/counts_templates.py +++ b/cumulus_library/builders/statistics_templates/counts_templates.py @@ -61,7 +61,7 @@ def get_count_query( table_col_classed.append(CountColumn(name=item, db_type="varchar", alias=None)) table_cols = table_col_classed - query = base_templates.get_base_template( + query = base_templates.get_template( "count", path, table_name=table_name, diff --git a/cumulus_library/template_sql/base_templates.py b/cumulus_library/template_sql/base_templates.py index df5daaf7..954e148f 100644 --- a/cumulus_library/template_sql/base_templates.py +++ b/cumulus_library/template_sql/base_templates.py @@ -17,7 +17,14 @@ class TableView(enum.Enum): VIEW = "VIEW" -def get_base_template(filename_stem: str, path: pathlib.Path | None = None, **kwargs: dict) -> str: +def get_base_template(*args, **kwargs): + """Legacy wrapper for get_template""" + # TODO: Isolate this as sole library entrypoint after cutting over studies to use + # the public get_template + return get_template(*args, **kwargs) + + +def get_template(filename_stem: str, path: pathlib.Path | None = None, **kwargs: dict) -> str: """Abstract renderer for jinja templates You can use this renderer directly, but if you are designing a function diff --git a/cumulus_library/template_sql/ctas_from_parquet.sql.jinja b/cumulus_library/template_sql/ctas_from_parquet.sql.jinja index 5423ee45..76b671b0 100644 --- a/cumulus_library/template_sql/ctas_from_parquet.sql.jinja +++ b/cumulus_library/template_sql/ctas_from_parquet.sql.jinja @@ -2,7 +2,7 @@ {%- if db_type == 'athena' -%} CREATE EXTERNAL TABLE IF NOT EXISTS `{{ schema_name }}`.`{{ table_name }}` ( {%- elif db_type == 'duckdb' -%} -CREATE TABLE IF NOT EXISTS "{{ table_name }}" AS SELECT +CREATE TABLE IF NOT EXISTS "{{schema_name}}"."{{ table_name }}" AS SELECT {%- endif %} {%- for col in table_cols %} {%- if db_type == 'athena' %} {{ col }} {{ remote_table_cols_types[loop.index0] }} diff --git a/cumulus_library/template_sql/insert_into.sql.jinja b/cumulus_library/template_sql/insert_into.sql.jinja index 214bf302..38547cbc 100644 --- a/cumulus_library/template_sql/insert_into.sql.jinja +++ b/cumulus_library/template_sql/insert_into.sql.jinja @@ -1,5 +1,5 @@ {%- import 'syntax.sql.jinja' as syntax -%} -INSERT INTO {{ schema_name }}.{{ table_name }} +INSERT INTO "{{ schema_name }}"."{{ table_name }}" ( {%- for col in table_cols -%} "{{ col }}" diff --git a/tests/test_actions.py b/tests/test_actions.py index 49d0a501..c87dfca1 100644 --- a/tests/test_actions.py +++ b/tests/test_actions.py @@ -399,7 +399,7 @@ def test_import_study(tmp_path, mock_db_config): archive.write(tmp_path / "archive/test__table.csv") (tmp_path / "archive/test__table.parquet").unlink() (tmp_path / "archive/test__table.csv").unlink() - mock_db_config.schema = "schema_name" + mock_db_config.schema = "main" importer.import_archive(config=mock_db_config, archive_path=tmp_path / "archive/test.zip") assert len(list((tmp_path / "archive").glob("*"))) == 1 test_table = mock_db_config.db.cursor().execute("SELECT * FROM test__table").fetchall() diff --git a/tests/test_base_templates.py b/tests/test_base_templates.py index 11e42272..8629d7f1 100644 --- a/tests/test_base_templates.py +++ b/tests/test_base_templates.py @@ -557,7 +557,7 @@ def test_ctas_empty_query_creation(expected, schema, table, cols, types): ["String", "Int"], ), ( - """CREATE TABLE IF NOT EXISTS "local_table" AS SELECT "a", "b" + """CREATE TABLE IF NOT EXISTS "test_duckdb"."local_table" AS SELECT "a", "b" FROM read_parquet('./tests/test_data/*.parquet')""", "duckdb", "test_duckdb", @@ -728,7 +728,7 @@ def test_extension_denormalize_creation(): def test_insert_into_query_creation(): - expected = """INSERT INTO test.test_table + expected = """INSERT INTO "test"."test_table" ("a","b") VALUES ('foo','foo'), @@ -740,7 +740,7 @@ def test_insert_into_query_creation(): dataset=[["foo", "foo"], ["bar", "bar"]], ) assert query == expected - expected = """INSERT INTO test.test_table + expected = """INSERT INTO "test"."test_table" ("a","b") VALUES ('foo',VARCHAR 'foo'), diff --git a/tests/test_counts_builder.py b/tests/test_counts_builder.py index 676c9558..8f5f2c65 100644 --- a/tests/test_counts_builder.py +++ b/tests/test_counts_builder.py @@ -1,12 +1,11 @@ """tests for outputs of counts_builder module""" -import pathlib from contextlib import nullcontext as does_not_raise from unittest import mock import pytest -from cumulus_library import StudyManifest, errors +from cumulus_library import errors from cumulus_library.builders import counts TEST_PREFIX = "test" @@ -201,18 +200,8 @@ def test_count_wrappers( def test_null_study_prefix(): - builder = counts.CountsBuilder() with pytest.raises(errors.CountsBuilderError): - builder.get_table_name("table") # needs study_prefix to work - - -def test_implicit_study_prefix(): - builder = counts.CountsBuilder() - manifest = StudyManifest(pathlib.Path("./tests/test_data/study_python_valid")) - - assert builder.study_prefix is None - builder.prepare_queries(manifest=manifest) - assert builder.study_prefix == "study_python_valid" + counts.CountsBuilder() def test_write_queries(tmp_path): diff --git a/tests/test_data/study_python_counts_valid/module1.py b/tests/test_data/study_python_counts_valid/module1.py index 0871e908..37223e8f 100644 --- a/tests/test_data/study_python_counts_valid/module1.py +++ b/tests/test_data/study_python_counts_valid/module1.py @@ -4,9 +4,6 @@ class ModuleOneRunner(cumulus_library.CountsBuilder): display_text = "module1" - def __init__(self, study_prefix): - super().__init__(study_prefix="study_python_counts_valid") - def prepare_queries(self, *args, **kwargs): self.queries.append( "CREATE TABLE IF NOT EXISTS study_python_counts_valid__table1 (test int);" diff --git a/tests/test_data/study_python_counts_valid/module2.py b/tests/test_data/study_python_counts_valid/module2.py index 9d96f130..c78feda8 100644 --- a/tests/test_data/study_python_counts_valid/module2.py +++ b/tests/test_data/study_python_counts_valid/module2.py @@ -4,9 +4,6 @@ class ModuleTwoRunner(cumulus_library.CountsBuilder): display_text = "module2" - def __init__(self, study_prefix): - super().__init__(study_prefix="study_python_counts_valid") - def prepare_queries(self, *args, **kwargs): self.queries.append( "CREATE TABLE IF NOT EXISTS study_python_counts_valid__table2 (test int);"