Skip to content

Commit

Permalink
feat(chart-review): push internal docref spans to Label Studio
Browse files Browse the repository at this point in the history
In case we need to know the internal placement of doc notes
when we gather multiple notes into one note, push up the start and
end offsets of those texts.

No planned use for this right now, but I want to get this in so
when we do want it, the metadata already exists.
  • Loading branch information
mikix committed Oct 26, 2023
1 parent a12aa01 commit 05321d5
Show file tree
Hide file tree
Showing 9 changed files with 131 additions and 23 deletions.
4 changes: 4 additions & 0 deletions compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,19 +51,23 @@ services:
ctakes-covid:
extends: ctakes-covid-base
profiles:
- chart-review
- chart-review-gpu
- covid-symptom
- covid-symptom-gpu

cnlpt-negation:
image: smartonfhir/cnlp-transformers:negation-0.6.1-cpu
profiles:
- chart-review
- covid-symptom
networks:
- cumulus-etl

cnlpt-negation-gpu:
image: smartonfhir/cnlp-transformers:negation-0.6.1-gpu
profiles:
- chart-review-gpu
- covid-symptom-gpu
networks:
- cumulus-etl
Expand Down
40 changes: 33 additions & 7 deletions cumulus_etl/chart_review/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def init_checks(args: argparse.Namespace):

if args.nlp:
nlp.check_ctakes()
nlp.check_cnlpt()
nlp.check_negation_cnlpt()

if not cli_utils.is_url_available(args.label_studio_url, retry=False):
errors.fatal(
Expand Down Expand Up @@ -80,8 +80,18 @@ async def read_notes_from_ndjson(
anon_enc_id = codebook.fake_id("Encounter", enc_id)
doc_id = docrefs[i]["id"]
doc_mappings = {doc_id: codebook.fake_id("DocumentReference", doc_id)}

notes.append(LabelStudioNote(enc_id, anon_enc_id, doc_mappings, title, text))
doc_spans = {doc_id: (0, len(text))}

notes.append(
LabelStudioNote(
enc_id,
anon_enc_id,
doc_mappings=doc_mappings,
doc_spans=doc_spans,
title=title,
text=text,
)
)

return notes

Expand All @@ -101,7 +111,12 @@ async def run_nlp(notes: Collection[LabelStudioNote], args: argparse.Namespace)
ctakes_json = await ctakesclient.client.extract(note.text, client=http_client)
matches = ctakes_json.list_match(polarity=Polarity.pos)
spans = ctakes_json.list_spans(matches)
cnlpt_results = await ctakesclient.transformer.list_polarity(note.text, spans, client=http_client)
cnlpt_results = await ctakesclient.transformer.list_polarity(
note.text,
spans,
client=http_client,
model=nlp.TransformerModel.NEGATION,
)
note.matches = [match for i, match in enumerate(matches) if cnlpt_results[i] == Polarity.pos]


Expand Down Expand Up @@ -133,6 +148,7 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
grouped_text = ""
grouped_matches = []
grouped_doc_mappings = {}
grouped_doc_spans = {}

for note in enc_notes:
grouped_doc_mappings.update(note.doc_mappings)
Expand All @@ -145,14 +161,24 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
offset = len(grouped_text)
grouped_text += note.text

offset_doc_spans = {k: (v[0] + offset, v[1] + offset) for k, v in note.doc_spans.items()}
grouped_doc_spans.update(offset_doc_spans)

for match in note.matches:
match.begin += offset
match.end += offset
grouped_matches.append(match)

grouped_note = LabelStudioNote(enc_id, enc_notes[0].anon_id, grouped_doc_mappings, "", grouped_text)
grouped_note.matches = grouped_matches
grouped_notes.append(grouped_note)
grouped_notes.append(
LabelStudioNote(
enc_id,
enc_notes[0].anon_id,
text=grouped_text,
doc_mappings=grouped_doc_mappings,
doc_spans=grouped_doc_spans,
matches=grouped_matches,
)
)

return grouped_notes

Expand Down
2 changes: 1 addition & 1 deletion cumulus_etl/chart_review/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ async def _download_docrefs_from_real_ids(

# Grab identifiers for which specific docrefs we need
with common.read_csv(docref_csv) as reader:
docref_ids = {row["docref_id"] for row in reader}
docref_ids = sorted({row["docref_id"] for row in reader})

# Kick off a bunch of requests to the FHIR server for these documents
coroutines = [_request_docref(client, docref_id) for docref_id in docref_ids]
Expand Down
26 changes: 19 additions & 7 deletions cumulus_etl/chart_review/labelstudio.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""LabelStudio document annotation"""

import dataclasses
from collections.abc import Collection, Iterable

import ctakesclient.typesystem
Expand All @@ -17,14 +18,24 @@
###############################################################################


@dataclasses.dataclass
class LabelStudioNote:
def __init__(self, enc_id: str, anon_id: str, doc_mappings: dict[str, str], title: str, text: str):
self.enc_id = enc_id
self.anon_id = anon_id
self.doc_mappings = doc_mappings
self.title = title
self.text = text
self.matches: list[ctakesclient.typesystem.MatchText] = []
enc_id: str # real Encounter ID
anon_id: str # anonymized Encounter ID
text: str = "" # text of the note, sent to Label Studio

# A title is only used when combining notes into one big encounter note. It's not sent to Label Studio.
title: str = ""

# Doc mappings is a dict of real DocRef ID -> anonymized DocRef ID of all contained notes, in order
doc_mappings: dict[str, str] = dataclasses.field(default_factory=dict)

    # Doc spans indicate which bits of the text come from which DocRef - it maps a real DocRef ID to a pair of
    # character offsets: start (0-based, inclusive) and end (0-based, exclusive) - just like cTAKES match text spans.
doc_spans: dict[str, tuple[int, int]] = dataclasses.field(default_factory=dict)

# Matches found by cTAKES
matches: list[ctakesclient.typesystem.MatchText] = dataclasses.field(default_factory=list)


class LabelStudioClient:
Expand Down Expand Up @@ -84,6 +95,7 @@ def _format_task_for_note(self, note: LabelStudioNote) -> dict:
"enc_id": note.enc_id,
"anon_id": note.anon_id,
"docref_mappings": note.doc_mappings,
"docref_spans": {k: list(v) for k, v in note.doc_spans.items()}, # json doesn't natively have tuples
},
}

Expand Down
2 changes: 1 addition & 1 deletion cumulus_etl/nlp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Support code for NLP servers"""

from .extract import ctakes_extract, ctakes_httpx_client, list_polarity
from .extract import TransformerModel, ctakes_extract, ctakes_httpx_client, list_polarity
from .huggingface import hf_prompt, hf_info, llama2_prompt
from .utils import cache_wrapper, is_docref_valid
from .watcher import check_negation_cnlpt, check_term_exists_cnlpt, check_ctakes, restart_ctakes_with_bsv
17 changes: 16 additions & 1 deletion docs/chart-review.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ Along the way, it can mark the note with NLP results and/or anonymize the note w
This is useful for not just actual chart reviews, but also for developing a custom NLP dictionary.
You can feed Cumulus ETL a custom NLP dictionary, review how it performs, and iterate upon it.

## Preliminary Label Studio Setup
## Preliminaries

### Label Studio Setup

This guide assumes you already have a local instance of Label Studio running.
They offer Docker images and reasonable
Expand All @@ -27,6 +29,19 @@ If you haven't set that up yet, go do that and come back.
The Cumulus team can help you with setting it up if you come talk to us,
but the rest of this guide will mostly deal with chart review mode itself.

### Dependent Services

Some features of chart review mode need external services (like cTAKES to run NLP).
Launch those before you begin using chart review:

```shell
export UMLS_API_KEY=your-umls-api-key
docker compose --profile chart-review up -d
```

Or if you have access to a GPU,
you can speed up the NLP by launching the GPU profile instead with `--profile chart-review-gpu`.

## Basic Operation

At its core, chart review mode is just another ETL (extract, transform, load) operation.
Expand Down
Empty file added tests/chart_review/__init__.py
Empty file.
50 changes: 45 additions & 5 deletions tests/test_chart_cli.py → tests/chart_review/test_chart_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ async def run_chart_review(
await cli.main(args)

@staticmethod
def make_docref(doc_id: str, text: str = None, content: list[dict] = None) -> dict:
def make_docref(doc_id: str, text: str = None, content: list[dict] = None, enc_id: str = None) -> dict:
if content is None:
text = text or "What's up doc?"
content = [
Expand All @@ -117,11 +117,12 @@ def make_docref(doc_id: str, text: str = None, content: list[dict] = None) -> di
}
]

enc_id = enc_id or f"enc-{doc_id}"
return {
"resourceType": "DocumentReference",
"id": doc_id,
"content": content,
"context": {"encounter": [{"reference": f"Encounter/enc-{doc_id}"}]},
"context": {"encounter": [{"reference": f"Encounter/{enc_id}"}]},
}

@staticmethod
Expand Down Expand Up @@ -332,9 +333,7 @@ async def test_disabled_nlp(self):

@ddt.data(True, False)
async def test_philter(self, run_philter):
notes = [
LabelStudioNote("EncID", "EncAnon", {"DocID": "DocAnon"}, "My Title", "John Smith called on 10/13/2010")
]
notes = [LabelStudioNote("EncID", "EncAnon", title="My Title", text="John Smith called on 10/13/2010")]
with mock.patch("cumulus_etl.chart_review.cli.read_notes_from_ndjson", return_value=notes):
await self.run_chart_review(philter=run_philter)

Expand All @@ -350,3 +349,44 @@ async def test_philter(self, run_philter):
else:
expected_text = "John Smith called on 10/13/2010"
self.assertEqual(self.wrap_note("My Title", expected_text), task.text)

@respx.mock(assert_all_mocked=False)
async def test_combined_encounter_offsets(self, respx_mock):
# use server notes just for ease of making fake ones
self.mock_read_url(respx_mock, "D1", enc_id="43")
self.mock_read_url(respx_mock, "D2", enc_id="43")
respx_mock.post(os.environ["URL_CTAKES_REST"]).pass_through() # ignore cTAKES

with tempfile.NamedTemporaryFile() as file:
self.write_real_docrefs(file.name, ["D1", "D2"])
await self.run_chart_review(input_path="https://localhost", docrefs=file.name)

notes = self.ls_client.push_tasks.call_args[0][0]
self.assertEqual(1, len(notes))
note = notes[0]

# Did we mark that both IDs occur in one note correctly?
self.assertEqual({"D1": ANON_D1, "D2": ANON_D2}, note.doc_mappings)

# Did we mark the internal docref spans correctly?
first_span = (93, 107)
second_span = (285, 299)
self.assertEqual("What's up doc?", note.text[first_span[0] : first_span[1]])
self.assertEqual("What's up doc?", note.text[second_span[0] : second_span[1]])
self.assertEqual({"D1": first_span, "D2": second_span}, note.doc_spans)

# Did we edit cTAKES results correctly?
match1a = (93, 99)
match1b = (100, 102)
match1c = (103, 107)
match2a = (285, 291)
match2b = (292, 294)
match2c = (295, 299)
self.assertEqual("What's", note.text[match1a[0] : match1a[1]])
self.assertEqual("up", note.text[match1b[0] : match1b[1]])
self.assertEqual("doc?", note.text[match1c[0] : match1c[1]])
self.assertEqual("What's", note.text[match2a[0] : match2a[1]])
self.assertEqual("up", note.text[match2b[0] : match2b[1]])
self.assertEqual("doc?", note.text[match2c[0] : match2c[1]])
spans = {x.span().key() for x in note.matches}
self.assertEqual({match1a, match1b, match1c, match2a, match2b, match2c}, spans)
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,14 @@ def setUp(self):

@staticmethod
def make_note(*, enc_id: str = "enc", matches: bool = True) -> LabelStudioNote:
note = LabelStudioNote(enc_id, "enc-anon", {"doc": "doc-anon"}, "Ignored Title", "Normal note text")
text = "Normal note text"
note = LabelStudioNote(
enc_id,
"enc-anon",
doc_mappings={"doc": "doc-anon"},
doc_spans={"doc": (0, len(text))},
text=text,
)
if matches:
note.matches = ctakesmock.fake_ctakes_extract(note.text).list_match(polarity=Polarity.pos)
return note
Expand Down Expand Up @@ -61,6 +68,7 @@ def test_basic_push(self):
"enc_id": "enc",
"anon_id": "enc-anon",
"docref_mappings": {"doc": "doc-anon"},
"docref_spans": {"doc": [0, 16]},
},
"predictions": [
{
Expand Down Expand Up @@ -109,6 +117,7 @@ def test_no_matches(self):
"enc_id": "enc",
"anon_id": "enc-anon",
"docref_mappings": {"doc": "doc-anon"},
"docref_spans": {"doc": [0, 16]},
},
"predictions": [
{
Expand All @@ -132,6 +141,7 @@ def test_dynamic_labels(self, label_type):
"enc_id": "enc",
"anon_id": "enc-anon",
"docref_mappings": {"doc": "doc-anon"},
"docref_spans": {"doc": [0, 16]},
"mylabel": [
{"value": "Itch"},
{"value": "Nausea"},
Expand All @@ -151,6 +161,7 @@ def test_dynamic_labels_no_matches(self):
"enc_id": "enc",
"anon_id": "enc-anon",
"docref_mappings": {"doc": "doc-anon"},
"docref_spans": {"doc": [0, 16]},
"mylabel": [], # this needs to be sent, or the server will complain
},
self.get_pushed_task()["data"],
Expand Down

0 comments on commit 05321d5

Please sign in to comment.