diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b45f16b6..a9132aac 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -69,8 +69,11 @@ jobs: pip install bandit[toml] pycodestyle pylint black==22.12.0 - name: Run pycodestyle + # E203: pycodestyle is a little too rigid about slices & whitespace + # See https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html#slices + # W503: a default ignore that we are restoring run: | - pycodestyle --max-line-length=120 . + pycodestyle --max-line-length=120 --ignore=E203,W503 . - name: Run pylint if: success() || failure() # still run pylint if above checks fail diff --git a/compose.yaml b/compose.yaml index e598087a..ff6038c5 100644 --- a/compose.yaml +++ b/compose.yaml @@ -51,12 +51,15 @@ services: ctakes-covid: extends: ctakes-covid-base profiles: + - chart-review + - chart-review-gpu - covid-symptom - covid-symptom-gpu cnlpt-negation: image: smartonfhir/cnlp-transformers:negation-0.6.1-cpu profiles: + - chart-review - covid-symptom networks: - cumulus-etl @@ -64,6 +67,7 @@ services: cnlpt-negation-gpu: image: smartonfhir/cnlp-transformers:negation-0.6.1-gpu profiles: + - chart-review-gpu - covid-symptom-gpu networks: - cumulus-etl diff --git a/cumulus_etl/chart_review/cli.py b/cumulus_etl/chart_review/cli.py index a30f910b..cb7f1c33 100644 --- a/cumulus_etl/chart_review/cli.py +++ b/cumulus_etl/chart_review/cli.py @@ -20,7 +20,7 @@ def init_checks(args: argparse.Namespace): if args.nlp: nlp.check_ctakes() - nlp.check_cnlpt() + nlp.check_negation_cnlpt() if not cli_utils.is_url_available(args.label_studio_url, retry=False): errors.fatal( @@ -80,8 +80,18 @@ async def read_notes_from_ndjson( anon_enc_id = codebook.fake_id("Encounter", enc_id) doc_id = docrefs[i]["id"] doc_mappings = {doc_id: codebook.fake_id("DocumentReference", doc_id)} - - notes.append(LabelStudioNote(enc_id, anon_enc_id, doc_mappings, title, text)) + doc_spans = {doc_id: (0, len(text))} + + notes.append( + LabelStudioNote( + enc_id, + anon_enc_id, + doc_mappings=doc_mappings, + doc_spans=doc_spans, + title=title, + text=text, + ) + ) return notes @@ -101,7 +111,12 @@ async def run_nlp(notes: Collection[LabelStudioNote], args: argparse.Namespace) ctakes_json = await ctakesclient.client.extract(note.text, client=http_client) matches = ctakes_json.list_match(polarity=Polarity.pos) spans = ctakes_json.list_spans(matches) - cnlpt_results = await ctakesclient.transformer.list_polarity(note.text, spans, client=http_client) + cnlpt_results = await ctakesclient.transformer.list_polarity( + note.text, + spans, + client=http_client, + model=nlp.TransformerModel.NEGATION, + ) note.matches = [match for i, match in enumerate(matches) if cnlpt_results[i] == Polarity.pos] @@ -133,6 +148,7 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt grouped_text = "" grouped_matches = [] grouped_doc_mappings = {} + grouped_doc_spans = {} for note in enc_notes: grouped_doc_mappings.update(note.doc_mappings) @@ -145,14 +161,24 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt offset = len(grouped_text) grouped_text += note.text + offset_doc_spans = {k: (v[0] + offset, v[1] + offset) for k, v in note.doc_spans.items()} + grouped_doc_spans.update(offset_doc_spans) + for match in note.matches: match.begin += offset match.end += offset grouped_matches.append(match) - grouped_note = LabelStudioNote(enc_id, enc_notes[0].anon_id, grouped_doc_mappings, "", grouped_text) - grouped_note.matches = grouped_matches - grouped_notes.append(grouped_note) + grouped_notes.append( + LabelStudioNote( + enc_id, + enc_notes[0].anon_id, + text=grouped_text, + doc_mappings=grouped_doc_mappings, + doc_spans=grouped_doc_spans, + matches=grouped_matches, + ) + ) return grouped_notes diff --git a/cumulus_etl/chart_review/downloader.py b/cumulus_etl/chart_review/downloader.py index cb6a0c40..27f3cb90 100644 --- a/cumulus_etl/chart_review/downloader.py +++ b/cumulus_etl/chart_review/downloader.py @@ -68,7 +68,7 @@ async def _download_docrefs_from_real_ids( # Grab identifiers for which specific docrefs we need with common.read_csv(docref_csv) as reader: - docref_ids = {row["docref_id"] for row in reader} + docref_ids = sorted({row["docref_id"] for row in reader}) # Kick off a bunch of requests to the FHIR server for these documents coroutines = [_request_docref(client, docref_id) for docref_id in docref_ids] diff --git a/cumulus_etl/chart_review/labelstudio.py b/cumulus_etl/chart_review/labelstudio.py index a7939b94..e34686aa 100644 --- a/cumulus_etl/chart_review/labelstudio.py +++ b/cumulus_etl/chart_review/labelstudio.py @@ -1,5 +1,6 @@ """LabelStudio document annotation""" +import dataclasses from collections.abc import Collection, Iterable import ctakesclient.typesystem @@ -17,14 +18,26 @@ ############################################################################### +@dataclasses.dataclass class LabelStudioNote: - def __init__(self, enc_id: str, anon_id: str, doc_mappings: dict[str, str], title: str, text: str): - self.enc_id = enc_id - self.anon_id = anon_id - self.doc_mappings = doc_mappings - self.title = title - self.text = text - self.matches: list[ctakesclient.typesystem.MatchText] = [] + """Holds all the data that Label Studio will need for a single note (or a single grouped encounter note)""" + + enc_id: str # real Encounter ID + anon_id: str # anonymized Encounter ID + text: str = "" # text of the note, sent to Label Studio + + # A title is only used when combining notes into one big encounter note. It's not sent to Label Studio. + title: str = "" + + # Doc mappings is a dict of real DocRef ID -> anonymized DocRef ID of all contained notes, in order + doc_mappings: dict[str, str] = dataclasses.field(default_factory=dict) + + # Doc spans indicate which bits of the text come from which DocRef - it will map real DocRef ID to a pair of + # "first character" (0-based) and "last character" (0-based, exclusive) - just like cTAKES match text spans. + doc_spans: dict[str, tuple[int, int]] = dataclasses.field(default_factory=dict) + + # Matches found by cTAKES + matches: list[ctakesclient.typesystem.MatchText] = dataclasses.field(default_factory=list) class LabelStudioClient: @@ -84,6 +97,7 @@ def _format_task_for_note(self, note: LabelStudioNote) -> dict: "enc_id": note.enc_id, "anon_id": note.anon_id, "docref_mappings": note.doc_mappings, + "docref_spans": {k: list(v) for k, v in note.doc_spans.items()}, # json doesn't natively have tuples }, } diff --git a/cumulus_etl/nlp/__init__.py b/cumulus_etl/nlp/__init__.py index 541fa689..783a16a7 100644 --- a/cumulus_etl/nlp/__init__.py +++ b/cumulus_etl/nlp/__init__.py @@ -1,6 +1,6 @@ """Support code for NLP servers""" -from .extract import ctakes_extract, ctakes_httpx_client, list_polarity +from .extract import TransformerModel, ctakes_extract, ctakes_httpx_client, list_polarity from .huggingface import hf_prompt, hf_info, llama2_prompt from .utils import cache_wrapper, is_docref_valid from .watcher import check_negation_cnlpt, check_term_exists_cnlpt, check_ctakes, restart_ctakes_with_bsv diff --git a/docs/chart-review.md b/docs/chart-review.md index 336fbda2..0392aab6 100644 --- a/docs/chart-review.md +++ b/docs/chart-review.md @@ -17,7 +17,9 @@ Along the way, it can mark the note with NLP results and/or anonymize the note w This is useful for not just actual chart reviews, but also for developing a custom NLP dictionary. You can feed Cumulus ETL a custom NLP dictionary, review how it performs, and iterate upon it. -## Preliminary Label Studio Setup +## Preliminaries + +### Label Studio Setup This guide assumes you already have a local instance of Label Studio running. They offer Docker images and reasonable @@ -27,6 +29,19 @@ If you haven't set that up yet, go do that and come back. The Cumulus team can help you with setting it up if you come talk to us, but the rest of this guide will mostly deal with chart review mode itself. +### Dependent Services + +Some features of chart review mode need external services (like cTAKES to run NLP). +Launch those before you begin using chart review: + +```shell +export UMLS_API_KEY=your-umls-api-key +docker compose --profile chart-review up -d +``` + +Or if you have access to a GPU, +you can speed up the NLP by launching the GPU profile instead with `--profile chart-review-gpu`. + ## Basic Operation At its core, chart review mode is just another ETL (extract, transform, load) operation. diff --git a/tests/chart_review/__init__.py b/tests/chart_review/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_chart_cli.py b/tests/chart_review/test_chart_cli.py similarity index 86% rename from tests/test_chart_cli.py rename to tests/chart_review/test_chart_cli.py index 989cedfb..38b5bd43 100644 --- a/tests/test_chart_cli.py +++ b/tests/chart_review/test_chart_cli.py @@ -105,7 +105,7 @@ async def run_chart_review( await cli.main(args) @staticmethod - def make_docref(doc_id: str, text: str = None, content: list[dict] = None) -> dict: + def make_docref(doc_id: str, text: str = None, content: list[dict] = None, enc_id: str = None) -> dict: if content is None: text = text or "What's up doc?" content = [ @@ -117,11 +117,12 @@ def make_docref(doc_id: str, text: str = None, content: list[dict] = None) -> di } ] + enc_id = enc_id or f"enc-{doc_id}" return { "resourceType": "DocumentReference", "id": doc_id, "content": content, - "context": {"encounter": [{"reference": f"Encounter/enc-{doc_id}"}]}, + "context": {"encounter": [{"reference": f"Encounter/{enc_id}"}]}, } @staticmethod @@ -332,9 +333,7 @@ async def test_disabled_nlp(self): @ddt.data(True, False) async def test_philter(self, run_philter): - notes = [ - LabelStudioNote("EncID", "EncAnon", {"DocID": "DocAnon"}, "My Title", "John Smith called on 10/13/2010") - ] + notes = [LabelStudioNote("EncID", "EncAnon", title="My Title", text="John Smith called on 10/13/2010")] with mock.patch("cumulus_etl.chart_review.cli.read_notes_from_ndjson", return_value=notes): await self.run_chart_review(philter=run_philter) @@ -350,3 +349,44 @@ async def test_philter(self, run_philter): else: expected_text = "John Smith called on 10/13/2010" self.assertEqual(self.wrap_note("My Title", expected_text), task.text) + + @respx.mock(assert_all_mocked=False) + async def test_combined_encounter_offsets(self, respx_mock): + # use server notes just for ease of making fake ones + self.mock_read_url(respx_mock, "D1", enc_id="43") + self.mock_read_url(respx_mock, "D2", enc_id="43") + respx_mock.post(os.environ["URL_CTAKES_REST"]).pass_through() # ignore cTAKES + + with tempfile.NamedTemporaryFile() as file: + self.write_real_docrefs(file.name, ["D1", "D2"]) + await self.run_chart_review(input_path="https://localhost", docrefs=file.name) + + notes = self.ls_client.push_tasks.call_args[0][0] + self.assertEqual(1, len(notes)) + note = notes[0] + + # Did we mark that both IDs occur in one note correctly? + self.assertEqual({"D1": ANON_D1, "D2": ANON_D2}, note.doc_mappings) + + # Did we mark the internal docref spans correctly? + first_span = (93, 107) + second_span = (285, 299) + self.assertEqual("What's up doc?", note.text[first_span[0] : first_span[1]]) + self.assertEqual("What's up doc?", note.text[second_span[0] : second_span[1]]) + self.assertEqual({"D1": first_span, "D2": second_span}, note.doc_spans) + + # Did we edit cTAKES results correctly? + match1a = (93, 99) + match1b = (100, 102) + match1c = (103, 107) + match2a = (285, 291) + match2b = (292, 294) + match2c = (295, 299) + self.assertEqual("What's", note.text[match1a[0] : match1a[1]]) + self.assertEqual("up", note.text[match1b[0] : match1b[1]]) + self.assertEqual("doc?", note.text[match1c[0] : match1c[1]]) + self.assertEqual("What's", note.text[match2a[0] : match2a[1]]) + self.assertEqual("up", note.text[match2b[0] : match2b[1]]) + self.assertEqual("doc?", note.text[match2c[0] : match2c[1]]) + spans = {x.span().key() for x in note.matches} + self.assertEqual({match1a, match1b, match1c, match2a, match2b, match2c}, spans) diff --git a/tests/test_chart_labelstudio.py b/tests/chart_review/test_chart_labelstudio.py similarity index 93% rename from tests/test_chart_labelstudio.py rename to tests/chart_review/test_chart_labelstudio.py index 5c59a882..a7eee71b 100644 --- a/tests/test_chart_labelstudio.py +++ b/tests/chart_review/test_chart_labelstudio.py @@ -26,7 +26,14 @@ def setUp(self): @staticmethod def make_note(*, enc_id: str = "enc", matches: bool = True) -> LabelStudioNote: - note = LabelStudioNote(enc_id, "enc-anon", {"doc": "doc-anon"}, "Ignored Title", "Normal note text") + text = "Normal note text" + note = LabelStudioNote( + enc_id, + "enc-anon", + doc_mappings={"doc": "doc-anon"}, + doc_spans={"doc": (0, len(text))}, + text=text, + ) if matches: note.matches = ctakesmock.fake_ctakes_extract(note.text).list_match(polarity=Polarity.pos) return note @@ -61,6 +68,7 @@ def test_basic_push(self): "enc_id": "enc", "anon_id": "enc-anon", "docref_mappings": {"doc": "doc-anon"}, + "docref_spans": {"doc": [0, 16]}, }, "predictions": [ { @@ -109,6 +117,7 @@ def test_no_matches(self): "enc_id": "enc", "anon_id": "enc-anon", "docref_mappings": {"doc": "doc-anon"}, + "docref_spans": {"doc": [0, 16]}, }, "predictions": [ { @@ -132,6 +141,7 @@ def test_dynamic_labels(self, label_type): "enc_id": "enc", "anon_id": "enc-anon", "docref_mappings": {"doc": "doc-anon"}, + "docref_spans": {"doc": [0, 16]}, "mylabel": [ {"value": "Itch"}, {"value": "Nausea"}, @@ -151,6 +161,7 @@ def test_dynamic_labels_no_matches(self): "enc_id": "enc", "anon_id": "enc-anon", "docref_mappings": {"doc": "doc-anon"}, + "docref_spans": {"doc": [0, 16]}, "mylabel": [], # this needs to be sent, or the server will complain }, self.get_pushed_task()["data"],