Skip to content

Commit

Permalink
feat(deid): drop valueString instead of passing it through
Browse files Browse the repository at this point in the history
Previously, we would let Observation.valueString (and its component
form) through, optionally masking it with philter if you passed
--philter in.

But it held PHI too often, and if we ever will use the field,
it will likely be via ETL-side NLP. We don't need to keep the string
intact into Athena.

So now, we unconditionally drop Observation.valueString, replacing it
with a data-absent-reason extension marker (so consumers know it
_was_ present).
  • Loading branch information
mikix committed Nov 19, 2024
1 parent 468dcfb commit 741fb34
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 58 deletions.
4 changes: 2 additions & 2 deletions cumulus_etl/deid/ms-config.json
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@
{"path": "Observation.effective", "method": "keep"},
{"path": "Observation.issued", "method": "keep"},
{"path": "Observation.performer", "method": "keep"},
{"path": "Observation.value", "method": "keep"}, // should run philter on valueString
{"path": "Observation.value", "method": "keep"}, // we drop valueString inside ETL
{"path": "Observation.dataAbsentReason", "method": "keep"},
{"path": "Observation.interpretation", "method": "keep"},
// Skip Observation.note
Expand All @@ -363,7 +363,7 @@
{"path": "Observation.hasMember", "method": "keep"},
{"path": "Observation.derivedFrom", "method": "keep"},
{"path": "Observation.component.code", "method": "keep"},
{"path": "Observation.component.value", "method": "keep"}, // should run philter on valueString
{"path": "Observation.component.value", "method": "keep"}, // we drop valueString inside ETL
{"path": "Observation.component.dataAbsentReason", "method": "keep"},
{"path": "Observation.component.interpretation", "method": "keep"},
// Skip Observation.component.referenceRange
Expand Down
28 changes: 18 additions & 10 deletions cumulus_etl/deid/scrubber.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,23 +406,31 @@ def _check_text(self, key: str, value: Any) -> Any:
if isinstance(value, str) and any(
(
# Coding.display is clinically unnecessary but is useful for rendering.
# Since "code" is always present, downstream consumers can & should provide their own display label.
# But we don't remove it entirely, for cases where unexpected codes are used.
# Note that this will definitely over-scrub (like scrubbing "White" from the USCDI race extension),
# but again -- this display value is redundant and rather than try to be smart, we're safely dumb.
# Since "code" is always present, downstream consumers can & should provide their
# own display label. But we don't remove it entirely, for cases where unexpected
# codes are used. Note that this will definitely over-scrub (like scrubbing "White"
# from the USCDI race extension), but again -- this display value is redundant and
# rather than try to be smart, we're safely dumb.
key == "display",
# CodeableConcept.text has clinical value for situations that don't have clear coding yet.
# Think early-days Covid day PCRs. Which is why we let it through in the first place.
# CodeableConcept.text has clinical value for situations that don't have clear
# coding yet, like early-days Covid day PCRs. And text-only codeable concepts show
# up a lot when the EHR allows it. Hence why we normally let it through.
# But we should still scrub it since it is loose text that could hold PHI.
key == "text",
# Observation.valueString has clinical value, but could hold PHI.
# Similarly, extensions might have valueString members (though the only supported ones don't have
# interesting fields there -- race & ethnicity allow for freeform text descriptions).
key == "valueString",
)
):
value = self.scrub_text(value)

if isinstance(value, str) and any(
(
# Observation.valueString has some clinical value, but often holds PHI.
# If we ever want to use this for clinical purposes, we should process it via NLP
# on the ETL side (like we do for DocumentReference attachments).
key == "valueString",
)
):
raise MaskValue

return value

@staticmethod
Expand Down
1 change: 0 additions & 1 deletion docs/deid.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,6 @@ There are some freeform text fields that Cumulus ETS asks the Microsoft Anonymiz
These fields are useful for presenting or computing a phenotype:
- `CodeableConcept.text`
- `Coding.display`
- `Observation.valueString` and `Observation.component.valueString`

Although Cumulus wants to largely preserve these fields,
they may contain PHI since they are freeform text fields after all.
Expand Down
13 changes: 6 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@
name = "cumulus-etl"
requires-python = ">= 3.10"
# These dependencies are mostly pinned to be under the next major version.
# That makes particular sense as long as we don't have official releases yet and our code is used
# by pulling from main directly.
# But now there's a risk of missing a major release and bit-rotting the dependency tree.
# That makes particular sense as long as we don't have official releases yet and our docker
# images are built from main directly every commit.
#
# We should either (a) configure a bot to warn us about stale dependencies, or
# (b) once we switch to a more planned release schedule (via docker or similar), just go back to
# open-pinned dependencies so that we're more likely to notice new releases (we'll still have time
# to fix any breakages since users won't immediately see the problem).
# We mitigate the risk of missing a new release by having a bot watch for them.
#
# But if we ever start releasing on PyPI, we could switch to open-pinned dependencies to be
# less annoying to pip users.
dependencies = [
"ctakesclient >= 5.1, < 6",
"cumulus-fhir-support >= 1.2, < 2",
Expand Down
15 changes: 0 additions & 15 deletions tests/deid/test_deid_philter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,21 +23,6 @@ def setUp(self):
{"Coding": {"display": "Patient 012-34-5678"}},
{"Coding": {"display": "Patient ***-**-****"}},
),
(
# philter catches the month for some reason, but correctly leaves the date numbers alone
{"resourceType": "Observation", "valueString": "Born on december 12 2012"},
{"resourceType": "Observation", "valueString": "Born on ******** 12 2012"},
),
(
{
"resourceType": "Observation",
"component": [{"valueString": "Contact at [email protected]"}],
},
{
"resourceType": "Observation",
"component": [{"valueString": "Contact at ***@***.***"}],
},
),
)
@ddt.unpack
def test_scrub_resource(self, resource, expected):
Expand Down
56 changes: 33 additions & 23 deletions tests/deid/test_deid_scrubber.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,16 @@
from cumulus_etl.deid.codebook import CodebookDB
from tests import i2b2_mock_data, utils

# Used in a few tests - just define once for cleanliness
MASKED_EXTENSION = {
"extension": [
{
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
"valueCode": "masked",
}
]
}


class TestScrubber(utils.AsyncTestCase):
"""Test case for the Scrubber class"""
Expand Down Expand Up @@ -103,22 +113,8 @@ def test_documentreference(self):
self.assertEqual(
docref["content"][0]["attachment"],
{
"_data": {
"extension": [
{
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
"valueCode": "masked",
}
]
},
"_url": {
"extension": [
{
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
"valueCode": "masked",
}
]
},
"_data": MASKED_EXTENSION,
"_url": MASKED_EXTENSION,
},
)
self.assertEqual(
Expand All @@ -132,14 +128,28 @@ def test_documentreference(self):
}
]
},
"_url": {
"extension": [
{
"url": "http://hl7.org/fhir/StructureDefinition/data-absent-reason",
"valueCode": "masked",
}
]
"_url": MASKED_EXTENSION,
},
)

def test_value_string_is_masked(self):
"""Verify that Observation.*.valueString is masked"""
obs = {
"resourceType": "Observation",
"valueString": "Hello Alice!",
"component": [
{
"valueString": "Also heyo Bob!",
},
],
}
self.assertTrue(Scrubber().scrub_resource(obs))
self.assertEqual(
obs,
{
"resourceType": "Observation",
"_valueString": MASKED_EXTENSION,
"component": [{"_valueString": MASKED_EXTENSION}],
},
)

Expand Down

0 comments on commit 741fb34

Please sign in to comment.