From 741fb34678425759df58b91e6ba4da433f74e509 Mon Sep 17 00:00:00 2001 From: Michael Terry Date: Tue, 19 Nov 2024 16:02:54 -0500 Subject: [PATCH] feat(deid): drop valueString instead of passing it through Previously, we would let Observation.valueString (and its component form) through, optionally masking it with philter if you passed --philter in. But it held PHI too often, and if we ever will use the field, it will likely be via ETL-side NLP. We don't need to keep the string intact into Athena. So now, we unconditionally drop Observation.valueString, replacing it with a data-absent-reason extension marker (so consumers know it _was_ present). --- cumulus_etl/deid/ms-config.json | 4 +-- cumulus_etl/deid/scrubber.py | 28 ++++++++++------ docs/deid.md | 1 - pyproject.toml | 13 ++++---- tests/deid/test_deid_philter.py | 15 --------- tests/deid/test_deid_scrubber.py | 56 +++++++++++++++++++------------- 6 files changed, 59 insertions(+), 58 deletions(-) diff --git a/cumulus_etl/deid/ms-config.json b/cumulus_etl/deid/ms-config.json index 166a26f..e88dcac 100644 --- a/cumulus_etl/deid/ms-config.json +++ b/cumulus_etl/deid/ms-config.json @@ -351,7 +351,7 @@ {"path": "Observation.effective", "method": "keep"}, {"path": "Observation.issued", "method": "keep"}, {"path": "Observation.performer", "method": "keep"}, - {"path": "Observation.value", "method": "keep"}, // should run philter on valueString + {"path": "Observation.value", "method": "keep"}, // we drop valueString inside ETL {"path": "Observation.dataAbsentReason", "method": "keep"}, {"path": "Observation.interpretation", "method": "keep"}, // Skip Observation.note @@ -363,7 +363,7 @@ {"path": "Observation.hasMember", "method": "keep"}, {"path": "Observation.derivedFrom", "method": "keep"}, {"path": "Observation.component.code", "method": "keep"}, - {"path": "Observation.component.value", "method": "keep"}, // should run philter on valueString + {"path": "Observation.component.value", "method": "keep"}, // we drop valueString inside ETL {"path": "Observation.component.dataAbsentReason", "method": "keep"}, {"path": "Observation.component.interpretation", "method": "keep"}, // Skip Observation.component.referenceRange diff --git a/cumulus_etl/deid/scrubber.py b/cumulus_etl/deid/scrubber.py index 5166a99..cfb9e0b 100644 --- a/cumulus_etl/deid/scrubber.py +++ b/cumulus_etl/deid/scrubber.py @@ -406,23 +406,31 @@ def _check_text(self, key: str, value: Any) -> Any: if isinstance(value, str) and any( ( # Coding.display is clinically unnecessary but is useful for rendering. - # Since "code" is always present, downstream consumers can & should provide their own display label. - # But we don't remove it entirely, for cases where unexpected codes are used. - # Note that this will definitely over-scrub (like scrubbing "White" from the USCDI race extension), - # but again -- this display value is redundant and rather than try to be smart, we're safely dumb. + # Since "code" is always present, downstream consumers can & should provide their + # own display label. But we don't remove it entirely, for cases where unexpected + # codes are used. Note that this will definitely over-scrub (like scrubbing "White" + # from the USCDI race extension), but again -- this display value is redundant and + # rather than try to be smart, we're safely dumb. key == "display", - # CodeableConcept.text has clinical value for situations that don't have clear coding yet. - # Think early-days Covid day PCRs. Which is why we let it through in the first place. + # CodeableConcept.text has clinical value for situations that don't have clear + # coding yet, like early-days Covid day PCRs. And text-only codeable concepts show + # up a lot when the EHR allows it. Hence why we normally let it through. # But we should still scrub it since it is loose text that could hold PHI. key == "text", - # Observation.valueString has clinical value, but could hold PHI. - # Similarly, extensions might have valueString members (though the only supported ones don't have - # interesting fields there -- race & ethnicity allow for freeform text descriptions). - key == "valueString", ) ): value = self.scrub_text(value) + if isinstance(value, str) and any( + ( + # Observation.valueString has some clinical value, but often holds PHI. + # If we ever want to use this for clinical purposes, we should process it via NLP + # on the ETL side (like we do for DocumentReference attachments). + key == "valueString", + ) + ): + raise MaskValue + return value @staticmethod diff --git a/docs/deid.md b/docs/deid.md index 20f3877..6040f03 100644 --- a/docs/deid.md +++ b/docs/deid.md @@ -155,7 +155,6 @@ There are some freeform text fields that Cumulus ETS asks the Microsoft Anonymiz These fields are useful for presenting or computing a phenotype: - `CodeableConcept.text` - `Coding.display` -- `Observation.valueString` and `Observation.component.valueString` Although Cumulus wants to largely preserve these fields, they may contain PHI since they are freeform text fields after all. diff --git a/pyproject.toml b/pyproject.toml index aa5e2c4..2f570ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,14 +2,13 @@ name = "cumulus-etl" requires-python = ">= 3.10" # These dependencies are mostly pinned to be under the next major version. -# That makes particular sense as long as we don't have official releases yet and our code is used -# by pulling from main directly. -# But now there's a risk of missing a major release and bit-rotting the dependency tree. +# That makes particular sense as long as we don't have official releases yet and our docker +# images are built from main directly every commit. # -# We should either (a) configure a bot to warn us about stale dependencies, or -# (b) once we switch to a more planned release schedule (via docker or similar), just go back to -# open-pinned dependencies so that we're more likely to notice new releases (we'll still have time -# to fix any breakages since users won't immediately see the problem). +# We mitigate the risk of missing a new release by having a bot watch for them. +# +# But if we ever start releasing on PyPI, we could switch to open-pinned dependencies to be +# less annoying to pip users. dependencies = [ "ctakesclient >= 5.1, < 6", "cumulus-fhir-support >= 1.2, < 2", diff --git a/tests/deid/test_deid_philter.py b/tests/deid/test_deid_philter.py index a9ea1a7..6086417 100644 --- a/tests/deid/test_deid_philter.py +++ b/tests/deid/test_deid_philter.py @@ -23,21 +23,6 @@ def setUp(self): {"Coding": {"display": "Patient 012-34-5678"}}, {"Coding": {"display": "Patient ***-**-****"}}, ), - ( - # philter catches the month for some reason, but correctly leaves the date numbers alone - {"resourceType": "Observation", "valueString": "Born on december 12 2012"}, - {"resourceType": "Observation", "valueString": "Born on ******** 12 2012"}, - ), - ( - { - "resourceType": "Observation", - "component": [{"valueString": "Contact at foo@bar.com"}], - }, - { - "resourceType": "Observation", - "component": [{"valueString": "Contact at ***@***.***"}], - }, - ), ) @ddt.unpack def test_scrub_resource(self, resource, expected): diff --git a/tests/deid/test_deid_scrubber.py b/tests/deid/test_deid_scrubber.py index 5ab881a..c26d46c 100644 --- a/tests/deid/test_deid_scrubber.py +++ b/tests/deid/test_deid_scrubber.py @@ -10,6 +10,16 @@ from cumulus_etl.deid.codebook import CodebookDB from tests import i2b2_mock_data, utils +# Used in a few tests - just define once for cleanliness +MASKED_EXTENSION = { + "extension": [ + { + "url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", + "valueCode": "masked", + } + ] +} + class TestScrubber(utils.AsyncTestCase): """Test case for the Scrubber class""" @@ -103,22 +113,8 @@ def test_documentreference(self): self.assertEqual( docref["content"][0]["attachment"], { - "_data": { - "extension": [ - { - "url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", - "valueCode": "masked", - } - ] - }, - "_url": { - "extension": [ - { - "url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", - "valueCode": "masked", - } - ] - }, + "_data": MASKED_EXTENSION, + "_url": MASKED_EXTENSION, }, ) self.assertEqual( @@ -132,14 +128,28 @@ def test_documentreference(self): } ] }, - "_url": { - "extension": [ - { - "url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason", - "valueCode": "masked", - } - ] + "_url": MASKED_EXTENSION, + }, + ) + + def test_value_string_is_masked(self): + """Verify that Observation.*.valueString is masked""" + obs = { + "resourceType": "Observation", + "valueString": "Hello Alice!", + "component": [ + { + "valueString": "Also heyo Bob!", }, + ], + } + self.assertTrue(Scrubber().scrub_resource(obs)) + self.assertEqual( + obs, + { + "resourceType": "Observation", + "_valueString": MASKED_EXTENSION, + "component": [{"_valueString": MASKED_EXTENSION}], }, )