Merge pull request #291 from smart-on-fhir/mikix/fhir-support

feat: use the new cumulus-fhir-support module for schemas
smart-on-fhir · Nov 22, 2023 · 38497d9 · 38497d9
2 parents cfeb3cb + f3ed99b
commit 38497d9
Show file tree

Hide file tree

Showing 5 changed files with 11 additions and 333 deletions.
diff --git a/cumulus_etl/etl/tasks/base.py b/cumulus_etl/etl/tasks/base.py
@@ -5,13 +5,14 @@
 import os
 from collections.abc import AsyncIterator, Iterator
 
+import cumulus_fhir_support
 import pyarrow
 import rich.live
 import rich.progress
 import rich.table
 import rich.text
 
-from cumulus_etl import cli_utils, common, deid, fhir, formats, store
+from cumulus_etl import cli_utils, common, deid, formats, store
 from cumulus_etl.etl import config
 from cumulus_etl.etl.tasks import batching
 
@@ -357,5 +358,5 @@ def get_schema(cls, formatter: formats.Format, rows: list[dict]) -> pyarrow.Sche
         Can be overridden as needed for non-FHIR outputs.
         """
         if formatter.resource_type:
-            return fhir.pyarrow_schema_from_resource_batch(formatter.resource_type, rows)
+            return cumulus_fhir_support.pyarrow_schema_from_rows(formatter.resource_type, rows)
         return None
diff --git a/cumulus_etl/fhir/__init__.py b/cumulus_etl/fhir/__init__.py
@@ -1,5 +1,4 @@
 """Support for talking to FHIR servers & handling the FHIR spec"""
 
 from .fhir_client import FhirClient, create_fhir_client_for_cli
-from .fhir_schemas import pyarrow_schema_from_resource_batch
 from .fhir_utils import download_reference, get_docref_note, ref_resource, unref_resource
diff --git a/cumulus_etl/fhir/fhir_schemas.py b/cumulus_etl/fhir/fhir_schemas.py
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,15 +12,15 @@ requires-python = ">= 3.10"
 # to fix any breakages since users won't immediately see the problem).
 dependencies = [
     "ctakesclient >= 5.1, < 6",
+    "cumulus-fhir-support >= 1, < 2",
     "delta-spark >= 3, < 4",
-    "fhirclient < 5",
     "httpx < 1",
     "inscriptis < 3",
     "jwcrypto < 2",
     "label-studio-sdk < 1",
     "oracledb < 2",
     "philter-lite < 1",
-    "pyarrow < 14",
+    "pyarrow < 15",
     "rich < 14",
     "s3fs",
 ]

diff --git a/tests/etl/test_tasks.py b/tests/etl/test_tasks.py
@@ -4,7 +4,6 @@
 from unittest import mock
 
 import ddt
-import pyarrow
 
 from cumulus_etl import common, errors
 from cumulus_etl.etl.tasks import basic_tasks, task_factory
@@ -113,165 +112,15 @@ async def test_batch_write_errors_saved(self):
             common.read_json(f"{self.errors_dir}/patient/write-error.002.ndjson"),
         )
 
-    async def test_batch_has_wide_schema(self):
-        self.make_json("Patient.1", "A")  # no interesting fields
-
+    async def test_batch_is_given_schema(self):
+        """Verify that we calculate a schema for a batch"""
+        self.make_json("Patient.1", "A")
         await basic_tasks.PatientTask(self.job_config, self.scrubber).run()
 
+        # Spot check that the schema (from cumulus-fhir-support) exists / looks right
         schema = self.format.write_records.call_args[0][0].schema
-        self.assertListEqual(
-            [
-                "resourceType",
-                "id",
-                "implicitRules",
-                "language",
-                "meta",
-                "contained",
-                "extension",
-                "modifierExtension",
-                "text",
-                "active",
-                "address",
-                "birthDate",
-                "communication",
-                "contact",
-                "deceasedBoolean",
-                "deceasedDateTime",
-                "gender",
-                "generalPractitioner",
-                "identifier",
-                "link",
-                "managingOrganization",
-                "maritalStatus",
-                "multipleBirthBoolean",
-                "multipleBirthInteger",
-                "name",
-                "photo",
-                "telecom",
-            ],
-            schema.names,
-        )
-
-        # Spot check a few of the types
-        self.assertEqual(pyarrow.string(), schema.field("id").type)
-        self.assertEqual(pyarrow.bool_(), schema.field("deceasedBoolean").type)
-        self.assertEqual(pyarrow.int32(), schema.field("multipleBirthInteger").type)
-        # Note how struct types only have basic types inside of them - this is intentional, no recursion of structs
-        # is done by the ETL.
-        self.assertEqual(
-            pyarrow.struct({"id": pyarrow.string(), "div": pyarrow.string(), "status": pyarrow.string()}),
-            schema.field("text").type,
-        )
-        self.assertEqual(
-            pyarrow.list_(pyarrow.struct({"id": pyarrow.string(), "preferred": pyarrow.bool_()})),
-            schema.field("communication").type,
-        )
-
-    async def test_batch_schema_includes_inferred_fields(self):
-        """Verify that deep (inferred) fields are also included in the final schema"""
-        # Make sure that we include different deep fields for each - final schema should be a union
-        self.make_json("Condition.1", "A", stage=[{"type": {"coding": [{"version": "1.0"}]}}])
-        self.make_json("Condition.2", "B", onsetRange={"low": {"value": 1.0}})
-
-        await basic_tasks.ConditionTask(self.job_config, self.scrubber).run()
-
-        schema = self.format.write_records.call_args[0][0].schema
-
-        # Start with simple, non-present CodeableConcept at level zero -- this should be fully described
-        self.assertEqual(
-            pyarrow.struct(
-                {
-                    "id": pyarrow.string(),
-                    "coding": pyarrow.list_(
-                        pyarrow.struct(
-                            {
-                                "id": pyarrow.string(),
-                                "code": pyarrow.string(),
-                                "display": pyarrow.string(),
-                                "system": pyarrow.string(),
-                                "userSelected": pyarrow.bool_(),
-                                "version": pyarrow.string(),
-                            }
-                        )
-                    ),
-                    "text": pyarrow.string(),
-                }
-            ),
-            schema.field("code").type,  # CodeableConcept type
-        )
-        # While a deeper non-present CodeableConcept should be ignored
-        self.assertEqual(
-            pyarrow.list_(
-                pyarrow.struct(
-                    {
-                        "id": pyarrow.string(),
-                        # "code" field is missing (CodeableConcept type)
-                        # "detail" field is missing (Reference type)
-                    }
-                )
-            ),
-            schema.field("evidence").type,  # BackboneElement type
-        )
-        # But if any piece of a deep CodeableConcept is present, it gets fully expanded.
-        self.assertEqual(
-            pyarrow.list_(
-                pyarrow.struct(
-                    {
-                        "id": pyarrow.string(),
-                        # "assessment" field is missing (Reference type)
-                        # "summary" field is missing (CodeableConcept type)
-                        # But the "type" is here in full because a piece of it was in the input
-                        "type": pyarrow.struct(
-                            {
-                                "id": pyarrow.string(),
-                                "coding": pyarrow.list_(
-                                    pyarrow.struct(
-                                        {
-                                            "id": pyarrow.string(),
-                                            "code": pyarrow.string(),
-                                            "display": pyarrow.string(),
-                                            "system": pyarrow.string(),
-                                            "userSelected": pyarrow.bool_(),
-                                            "version": pyarrow.string(),
-                                        }
-                                    )
-                                ),
-                                "text": pyarrow.string(),
-                            }
-                        ),
-                    }
-                )
-            ),
-            schema.field("stage").type,  # BackboneElement type
-        )
-        # Other deep-and-partial elements do not get the same expansion treatment.
-        # Here is a deep Quantity element.
-        # The parts present in the input are also in the schema, but only those parts.
-        self.assertEqual(
-            pyarrow.struct(
-                {
-                    "id": pyarrow.string(),
-                    "low": pyarrow.struct(
-                        {
-                            "value": pyarrow.float64(),
-                        }
-                    ),
-                }
-            ),
-            schema.field("onsetRange").type,
-        )
-
-    async def test_batch_schema_types_are_coerced(self):
-        """Verify that fields with "wrong" input types (like int instead of float) are correct in final schema"""
-        # Make sure that we include both wide and deep fields - we should coerce both into FHIR spec schema
-        self.make_json("ServiceRequest.1", "A", quantityQuantity={"value": 1})  # should be floating type
-        self.make_json("ServiceRequest.2", "B", quantityRange={"low": {"value": 2}})  # should be floating type
-
-        await basic_tasks.ServiceRequestTask(self.job_config, self.scrubber).run()
-
-        schema = self.format.write_records.call_args[0][0].schema
-        self.assertEqual(pyarrow.float64(), schema.field("quantityQuantity").type.field("value").type)
-        self.assertEqual(pyarrow.float64(), schema.field("quantityRange").type.field("low").type.field("value").type)
+        self.assertIn("address", schema.names)
+        self.assertIn("id", schema.names)
 
 
 @ddt.ddt