Skip to content

Commit

Permalink
fix: hand symptoms to cTAKES in a format it expects
Browse files Browse the repository at this point in the history
Ever since the March 17th, 2023 commit that fed cTAKES a custom bsv
file, our COVID NLP performance has taken a hit (unbeknownst to us).

cTAKES actually expects a slightly different format for symptoms bsv
files, which we now enforce as we send it files.

This raises our COVID NLP performance on a standard BCH set of 200-odd
Cerner HTML notes from 0.784 to 0.813 F1.

This commit bumps the covid tasks' task_version from 3 to 4.
  • Loading branch information
mikix committed Jan 4, 2024
1 parent 38497d9 commit 04478ff
Show file tree
Hide file tree
Showing 6 changed files with 52 additions and 12 deletions.
8 changes: 7 additions & 1 deletion cumulus_etl/etl/studies/covid_symptom/covid_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,14 @@ class BaseCovidSymptomNlpResultsTask(tasks.BaseNlpTask):

# Use a shared task_version for subclasses, to make sharing the ctakes cache folder easier
# (and they use essentially the same services anyway)
task_version = 3
task_version = 4
# Task Version History:
# ** 4 (2024-01): Fixed bug preventing our cTAKES symptoms file from having any effect **
# cTAKES: smartonfhir/ctakes-covid:1.1.0
# cNLP: smartonfhir/cnlp-transformers:negation-0.6.1
# cNLP: smartonfhir/cnlp-transformers:termexists-0.6.1
# ctakesclient: 5.0
#
# ** 3 (2023-09): Updated to cnlpt version 0.6.1 **
# cTAKES: smartonfhir/ctakes-covid:1.1.0
# cNLP: smartonfhir/cnlp-transformers:negation-0.6.1
Expand Down
40 changes: 37 additions & 3 deletions cumulus_etl/nlp/watcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
import shutil
import socket
import sys
import tempfile
import time
import urllib.parse

import ctakesclient

from cumulus_etl import cli_utils, errors
from cumulus_etl import cli_utils, common, errors


def check_ctakes() -> None:
Expand Down Expand Up @@ -94,6 +95,33 @@ def wait_for_ctakes_restart():
check_ctakes()


def _convert_bsv_file_to_ctakes_format(bsv_path: str, output_path: str) -> None:
    """
    Writes a cTAKES-compatible copy of the bsv file at bsv_path to output_path.

    cTAKES only expects to see 4 columns: CUI|TUI|STR|PREF
    But ctakesclient holds a more comprehensive 6-column version: CUI|TUI|CODE|SAB|STR|PREF
    The additional two fields are:
      CODE = Vocabulary Code
      SAB = Vocabulary Source Abbreviation (SNOMEDCT_US)

    Blank lines, lines starting with "#", and rows with fewer than 4 columns are left out
    of the output. Rows that already have exactly 4 columns are copied through untouched.

    :param bsv_path: path of the bsv file to read
    :param output_path: path that the converted bsv contents are written to
    """
    converted_rows = []

    for raw_row in common.read_text(bsv_path).splitlines():
        is_blank = not raw_row.strip()
        if is_blank or raw_row.startswith("#"):
            continue

        fields = raw_row.split("|")
        field_count = len(fields)

        if field_count < 4:
            # Not enough columns to build a cTAKES row from, so skip it
            continue

        if field_count == 4:
            # Already in the shape cTAKES wants
            converted_rows.append(raw_row)
        else:
            # Drop the middle columns: keep the first two and the last two
            converted_rows.append("|".join([*fields[:2], *fields[-2:]]))

    common.write_text(output_path, "\n".join(converted_rows))


def restart_ctakes_with_bsv(ctakes_overrides: str, bsv_path: str) -> bool:
"""Hands a new bsv over to cTAKES and waits for it to restart and be ready again with the new bsv file"""
# This whole setup is slightly janky. But it is designed with these constraints:
Expand Down Expand Up @@ -127,6 +155,12 @@ def restart_ctakes_with_bsv(ctakes_overrides: str, bsv_path: str) -> bool:
)
return False

with wait_for_ctakes_restart():
shutil.copyfile(bsv_path, os.path.join(ctakes_overrides, "symptoms.bsv"))
# First, coerce the bsv contents into a cTAKES compatible format.
with tempfile.NamedTemporaryFile() as tmp_bsv:
_convert_bsv_file_to_ctakes_format(bsv_path, tmp_bsv.name)

# Now copy that modified file into its final location inside cTAKES
with wait_for_ctakes_restart():
shutil.copyfile(tmp_bsv.name, os.path.join(ctakes_overrides, "symptoms.bsv"))

return True
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ dependencies = [
"inscriptis < 3",
"jwcrypto < 2",
"label-studio-sdk < 1",
"oracledb < 2",
"oracledb < 3",
"philter-lite < 1",
"pyarrow < 15",
"rich < 14",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{"id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d.0", "docref_id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d", "encounter_id": "b3d0707624491d8b71a808bd20b63625981af48f526b95214146de2a15f7dd43", "subject_id": "00680c7c0e2e1712e9c4a01eb5c6dfb8949871faef6337c5db204d19e1d9ca58", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 3, "match": {"begin": 6, "end": 9, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "386661006", "cui": "C0015967", "codingScheme": "SNOMEDCT_US", "tui": "T184"}, {"code": "50177009", "cui": "C0015967", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
{"id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d.1", "docref_id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d", "encounter_id": "b3d0707624491d8b71a808bd20b63625981af48f526b95214146de2a15f7dd43", "subject_id": "00680c7c0e2e1712e9c4a01eb5c6dfb8949871faef6337c5db204d19e1d9ca58", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 3, "match": {"begin": 6, "end": 9, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "422587007", "cui": "C0027497", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
{"id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e.0", "docref_id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e", "encounter_id": "58a65c6cc5693a507af44f25f062171898aa6bc469766956b2c802d39fc6d4a7", "subject_id": "84cc1e7381070fda74a80df28a29323101be3b2c26b4d604abf43946ab1759f6", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 3, "match": {"begin": 7, "end": 10, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "386661006", "cui": "C0015967", "codingScheme": "SNOMEDCT_US", "tui": "T184"}, {"code": "50177009", "cui": "C0015967", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
{"id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e.1", "docref_id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e", "encounter_id": "58a65c6cc5693a507af44f25f062171898aa6bc469766956b2c802d39fc6d4a7", "subject_id": "84cc1e7381070fda74a80df28a29323101be3b2c26b4d604abf43946ab1759f6", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 3, "match": {"begin": 7, "end": 10, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "422587007", "cui": "C0027497", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
{"id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d.0", "docref_id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d", "encounter_id": "b3d0707624491d8b71a808bd20b63625981af48f526b95214146de2a15f7dd43", "subject_id": "00680c7c0e2e1712e9c4a01eb5c6dfb8949871faef6337c5db204d19e1d9ca58", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 4, "match": {"begin": 6, "end": 9, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "386661006", "cui": "C0015967", "codingScheme": "SNOMEDCT_US", "tui": "T184"}, {"code": "50177009", "cui": "C0015967", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
{"id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d.1", "docref_id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d", "encounter_id": "b3d0707624491d8b71a808bd20b63625981af48f526b95214146de2a15f7dd43", "subject_id": "00680c7c0e2e1712e9c4a01eb5c6dfb8949871faef6337c5db204d19e1d9ca58", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 4, "match": {"begin": 6, "end": 9, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "422587007", "cui": "C0027497", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
{"id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e.0", "docref_id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e", "encounter_id": "58a65c6cc5693a507af44f25f062171898aa6bc469766956b2c802d39fc6d4a7", "subject_id": "84cc1e7381070fda74a80df28a29323101be3b2c26b4d604abf43946ab1759f6", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 4, "match": {"begin": 7, "end": 10, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "386661006", "cui": "C0015967", "codingScheme": "SNOMEDCT_US", "tui": "T184"}, {"code": "50177009", "cui": "C0015967", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
{"id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e.1", "docref_id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e", "encounter_id": "58a65c6cc5693a507af44f25f062171898aa6bc469766956b2c802d39fc6d4a7", "subject_id": "84cc1e7381070fda74a80df28a29323101be3b2c26b4d604abf43946ab1759f6", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 4, "match": {"begin": 7, "end": 10, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "422587007", "cui": "C0027497", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
{"id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d.1", "docref_id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d", "encounter_id": "b3d0707624491d8b71a808bd20b63625981af48f526b95214146de2a15f7dd43", "subject_id": "00680c7c0e2e1712e9c4a01eb5c6dfb8949871faef6337c5db204d19e1d9ca58", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 3, "match": {"begin": 6, "end": 9, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "422587007", "cui": "C0027497", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
{"id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e.1", "docref_id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e", "encounter_id": "58a65c6cc5693a507af44f25f062171898aa6bc469766956b2c802d39fc6d4a7", "subject_id": "84cc1e7381070fda74a80df28a29323101be3b2c26b4d604abf43946ab1759f6", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 3, "match": {"begin": 7, "end": 10, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "422587007", "cui": "C0027497", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
{"id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d.1", "docref_id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d", "encounter_id": "b3d0707624491d8b71a808bd20b63625981af48f526b95214146de2a15f7dd43", "subject_id": "00680c7c0e2e1712e9c4a01eb5c6dfb8949871faef6337c5db204d19e1d9ca58", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 4, "match": {"begin": 6, "end": 9, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "422587007", "cui": "C0027497", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
{"id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e.1", "docref_id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e", "encounter_id": "58a65c6cc5693a507af44f25f062171898aa6bc469766956b2c802d39fc6d4a7", "subject_id": "84cc1e7381070fda74a80df28a29323101be3b2c26b4d604abf43946ab1759f6", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 4, "match": {"begin": 7, "end": 10, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "422587007", "cui": "C0027497", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
2 changes: 1 addition & 1 deletion tests/etl/test_etl_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ async def test_etl_job_s3(self):
class TestEtlNlp(BaseEtlSimple):
"""Test case for the cTAKES/cNLP responses"""

CACHE_FOLDER = "covid_symptom_v3"
CACHE_FOLDER = "covid_symptom_v4"

def setUp(self):
super().setUp()
Expand Down

0 comments on commit 04478ff

Please sign in to comment.