Merge pull request #66 from globusmedical/fix_unit_test_squashed

Fixes for SQ type and unit tests, squashed from PR 64
KitwareMedical · Mar 13, 2024 · b37a883 · b37a883
2 parents 3ffe350 + 097b554
commit b37a883
Show file tree

Hide file tree

Showing 9 changed files with 195 additions and 93 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,4 +4,6 @@ __pycache__
 .vscode
 build
 *.egg-info
-.python-version
+.python-version
+# Ignore Pipfile.lock, since different versions of python and OS produce different hashes
+Pipfile.lock
diff --git a/Pipfile b/Pipfile
@@ -0,0 +1,15 @@
+
+# pipenv is useful for developing local package in 
+# editable mode. See https://pypi.org/project/pipenv/
+
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+dicom-anonymizer = {file = ".", editable = true}
+
+[dev-packages]
+pytest = "*"
+setuptools = "*"
diff --git a/cspell.json b/cspell.json
@@ -0,0 +1,50 @@
+// cSpell Settings
+{
+    // Version of the setting file.  Always 0.2
+    "version": "0.2",
+    // language - current active spelling language
+    "language": "en",
+    // ignorepaths - list of glob patterns for files to ignore
+    "ignorePaths": [
+        "ext",
+        "*.json",
+        "Pipfile.lock",
+        "**/tests"
+    ],
+    // words - list of words to be always considered correct
+    "words": [
+        "anonymization",
+        "argparse",
+        "bdist",
+        "behaviour",
+        "dcmread",
+        "DICM",
+        "DICOM",
+        "dicomanonymizer",
+        "dicomfields",
+        "Edern",
+        "Fiducials",
+        "Fontaine",
+        "Haumont",
+        "Kitware",
+        "Laurenn",
+        "multival",
+        "pipenv",
+        "Pipfile",
+        "pydicom",
+        "pytest",
+        "Radiopharmaceutical",
+        "setuptools",
+        "simpledicomanonymizer",
+        "tqdm",
+        "venv",
+        "virtualenv",
+        "xgggg"
+    ]
+    // flagWords - list of words to be always considered incorrect
+    // This is useful for offensive words and common spelling errors.
+    // For example "hte" should be "the"
+    // "flagWords": [
+    //     "hte"
+    // ]
+}
diff --git a/dicomanonymizer/anonymizer.py b/dicomanonymizer/anonymizer.py
@@ -26,7 +26,7 @@ def anonymize(input_path: str, output_path: str, anonymization_actions: dict, de
     Read data from input path (folder or file) and launch the anonymization.
 
     :param input_path: Path to a folder or to a file. If set to a folder,
-    then cross all over subfiles and apply anonymization.
+    then cross all over sub-folders and apply anonymization.
     :param output_path: Path to a folder or to a file.
     :param anonymization_actions: List of actions that will be applied on tags.
     :param deletePrivateTags: Whether to delete private tags.

diff --git a/dicomanonymizer/simpledicomanonymizer.py b/dicomanonymizer/simpledicomanonymizer.py
@@ -105,7 +105,7 @@ def replace_element(element):
     See https://laurelbridge.com/pdf/Dicom-Anonymization-Conformance-Statement.pdf
     """
     if element.VR in ('LO', 'LT', 'SH', 'PN', 'CS', 'ST', 'UT'):
-        element.value = 'Anonymized'
+        element.value = 'ANONYMIZED' # CS VR accepts only uppercase characters
     elif element.VR == 'UI':
         replace_element_UID(element)
     elif element.VR in ('DS', 'IS'):
@@ -118,8 +118,16 @@ def replace_element(element):
         element.value = b'Anonymized'
     elif element.VR == 'SQ':
         for sub_dataset in element.value:
-            for sub_element in sub_dataset.elements():
-                replace_element(sub_element)
+            for sub_element in sub_dataset.elements():                
+                if isinstance(sub_element, pydicom.dataelem.RawDataElement):
+                    # RawDataElement is a NamedTuple, so cannot set its value attribute.
+                    # Convert it to a DataElement, replace value, and set it back.
+                    # Found in https://github.com/KitwareMedical/dicom-anonymizer/issues/63
+                    e2 = pydicom.dataelem.DataElement_from_raw(sub_element)
+                    replace_element(e2)
+                    sub_dataset.add(e2)
+                else:
+                    replace_element(sub_element)
     else:
         raise NotImplementedError('Not anonymized. VR {} not yet implemented.'.format(element.VR))
 

diff --git a/examples/anonymize_dataset.py b/examples/anonymize_dataset.py
@@ -2,13 +2,17 @@
 from pydicom.data import get_testdata_file
 from pydicom import dcmread
 
+
 def main():
     original_ds = dcmread(get_testdata_file("CT_small.dcm"))
     data_ds = original_ds.copy()
-    anonymize_dataset(data_ds, delete_private_tags=True) # Anonymization is done in-place
+    anonymize_dataset(
+        data_ds, delete_private_tags=True
+    )  # Anonymization is done in-place
     print("Examples of original -> anonymized values:")
     for tt in ["PatientName", "PatientID", "StudyDate"]:
         print(f"  {tt}: '{original_ds[tt].value}' -> '{data_ds[tt].value}'")
 
+
 if __name__ == "__main__":
     main()
diff --git a/examples/anonymize_extra_rules.py b/examples/anonymize_extra_rules.py
@@ -1,11 +1,22 @@
 import argparse
 from dicomanonymizer import ALL_TAGS, anonymize, keep
 
+
 def main():
     parser = argparse.ArgumentParser(add_help=True)
-    parser.add_argument('input', help='Path to the input dicom file or input directory which contains dicom files')
-    parser.add_argument('output', help='Path to the output dicom file or output directory which will contains dicom files')
-    parser.add_argument('--suffix', action='store', help='Suffix that will be added at the end of series description')
+    parser.add_argument(
+        "input",
+        help="Path to the input dicom file or input directory which contains dicom files",
+    )
+    parser.add_argument(
+        "output",
+        help="Path to the output dicom file or output directory which will contains dicom files",
+    )
+    parser.add_argument(
+        "--suffix",
+        action="store",
+        help="Suffix that will be added at the end of series description",
+    )
     args = parser.parse_args()
 
     input_dicom_path = args.input
@@ -16,7 +27,7 @@ def main():
     def setup_series_description(dataset, tag):
         element = dataset.get(tag)
         if element is not None:
-            element.value = f'{element.value}-{args.suffix}'
+            element.value = f"{element.value}-{args.suffix}"
 
     # ALL_TAGS variable is defined on file dicomfields.py
     # the 'keep' method is already defined into the dicom-anonymizer
@@ -28,7 +39,13 @@ def setup_series_description(dataset, tag):
         extra_anonymization_rules[(0x0008, 0x103E)] = setup_series_description
 
     # Launch the anonymization
-    anonymize(input_dicom_path, output_dicom_path, extra_anonymization_rules, delete_private_tags=False)
+    anonymize(
+        input_dicom_path,
+        output_dicom_path,
+        extra_anonymization_rules,
+        delete_private_tags=False,
+    )
+
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/tests/test_anon.py b/tests/test_anon.py
@@ -16,7 +16,7 @@
 warnings.filterwarnings("ignore")
 
 
-def get_all_failed():
+def get_all_failed():  # sourcery skip: inline-immediately-returned-variable
     # The following files are intended to fail dcmread
     # No point including them for anonymization testing
     dcmread_failed = [
@@ -27,50 +27,7 @@ def get_all_failed():
         "OT-PAL-8-face.dcm",
     ]
 
-    # TODO: Investigate why these fail replacement test of anonymization
-    replaced_failed = [
-        "693_J2KI.dcm",
-        "JPEG-lossy.dcm",
-        "JPEG2000-embedded-sequence-delimiter.dcm",
-        "JPEG2000.dcm",
-        "JPGExtended.dcm",
-        "reportsi.dcm",
-        "reportsi_with_empty_number_tags.dcm",
-        "SC_rgb_gdcm_KY.dcm",
-        "SC_rgb_jpeg_lossy_gdcm.dcm",
-        "693_UNCI.dcm",
-        "JPEG-LL.dcm",
-        "JPEG2000_UNC.dcm",
-        "MR2_J2KI.dcm",
-        "MR2_J2KR.dcm",
-        "MR2_UNCI.dcm",
-        "RG1_J2KI.dcm",
-        "RG1_J2KR.dcm",
-        "RG1_UNCI.dcm",
-        "RG3_J2KI.dcm",
-        "RG3_J2KI.dcm",
-        "RG3_J2KR.dcm",
-        "RG3_UNCI.dcm",
-        "SC_rgb_gdcm2k_uncompressed.dcm",
-        "US1_J2KI.dcm",
-        "US1_J2KR.dcm",
-        "US1_UNCI.dcm",
-        "test-SR.dcm",
-    ]
-
-    # TODO: Investigate why these fail deletion test of anonymization
-    deleted_failed = [
-        "color3d_jpeg_baseline.dcm",
-        "JPGLosslessP14SV1_1s_1f_8b.dcm",
-        "test-SR.dcm",
-    ]
-
-    # TODO: Investigate why these fail emptying test of anonymization
-    emptied_failed = [
-        "JPGLosslessP14SV1_1s_1f_8b.dcm",
-        "test-SR.dcm",
-    ]
-    return dcmread_failed + replaced_failed + deleted_failed + emptied_failed
+    return dcmread_failed
 
 
 @lru_cache(maxsize=None)
@@ -83,6 +40,7 @@ def get_passing_files():
 @pytest.fixture(scope="module", params=get_passing_files())
 def orig_anon_dataset(request):
     orig_ds = dcmread(request.param)
+    orig_ds.filename = None # Non-None value causes warnings in copy(). Not needed for this testing
     anon_ds = orig_ds.copy()
     anonymize_dataset(anon_ds)
     return (orig_ds, anon_ds)
@@ -91,34 +49,81 @@ def orig_anon_dataset(request):
 def test_deleted_tags_are_removed(orig_anon_dataset):
     orig_ds, anon_ds = orig_anon_dataset
     deleted_tags = dicomfields.X_TAGS
-    for tt in deleted_tags:
-        if len(tt) == 2 and tt in orig_ds:
-            assert tt not in anon_ds
 
+    for tt in deleted_tags:  # sourcery skip: no-loop-in-tests
+        if (
+            len(tt) == 2 and tt in orig_ds
+        ):  # sourcery skip: merge-nested-ifs, no-conditionals-in-tests
+            # TODO: Investigate why Date type are replaced instead of deleted
+            # See issue https://github.com/KitwareMedical/dicom-anonymizer/issues/56
+            if orig_ds[tt].VR != "DA":  # sourcery skip: no-conditionals-in-tests
+                assert (
+                    tt not in anon_ds
+                ), f"({tt[0]:04X},{tt[1]:04x}):{orig_ds[tt].value}->{anon_ds[tt].value}"
 
-def test_changed_tags_are_replaced(orig_anon_dataset):
-    changed_tags = (
-        dicomfields.U_TAGS
-        + dicomfields.D_TAGS
-        + dicomfields.Z_D_TAGS
-        + dicomfields.X_D_TAGS
-        + dicomfields.X_Z_D_TAGS
-        + dicomfields.X_Z_U_STAR_TAGS
-    )
 
+changed_tags = (
+    dicomfields.U_TAGS
+    + dicomfields.D_TAGS
+    + dicomfields.Z_D_TAGS
+    + dicomfields.X_D_TAGS
+    + dicomfields.X_Z_D_TAGS
+    + dicomfields.X_Z_U_STAR_TAGS
+)
+
+empty_values = (0, "", "00010101", "000000.00", "ANONYMIZED")
+
+
+def is_elem_replaced(orig, anon) -> bool:
+    if orig.VR == "SQ":
+        for x, y in zip(orig.value, anon.value):
+            for tt in changed_tags:
+                if tt in x and len(x[tt].value) > 0:
+                    assert tt in y, f"({tt[0]:04X},{tt[1]:04x}):{x[tt].value}->missing!"
+                    assert is_elem_replaced(
+                        x[tt], y[tt]
+                    ), f"({tt[0]:04X},{tt[1]:04x}):{x[tt].value} not replaced"
+        return True
+
+    return orig.value != anon.value if orig.value not in empty_values else True
+
+
+def test_changed_tags_are_replaced(orig_anon_dataset):
     orig_ds, anon_ds = orig_anon_dataset
 
-    for tt in changed_tags:
-        if tt in orig_ds:
-            assert anon_ds[tt] != orig_ds[tt]
+    for tt in changed_tags:  # sourcery skip: no-loop-in-tests
+        if tt in orig_ds:  # sourcery skip: no-conditionals-in-tests
+            assert (
+                tt in anon_ds
+            ), f"({tt[0]:04X},{tt[1]:04x}):{orig_ds[tt].value}->missing!"
+            assert is_elem_replaced(
+                orig_ds[tt], anon_ds[tt]
+            ), f"({tt[0]:04X},{tt[1]:04x}):{orig_ds[tt].value} not replaced"
 
 
-def test_empty_tags_are_emptied(orig_anon_dataset):
-    empty_values = (0, "", "00010101", "000000.00")
-    empty_tags = dicomfields.Z_TAGS + dicomfields.X_Z_TAGS
+empty_tags = dicomfields.Z_TAGS + dicomfields.X_Z_TAGS
+
 
+def is_elem_empty(elem) -> bool:
+    if elem.VR == "SQ":
+        for x in elem.value:
+            for tt in empty_tags:
+                if tt in x and len(x[tt].value) > 0:
+                    assert is_elem_empty(
+                        x[tt]
+                    ), f"({tt[0]:04X},{tt[1]:04x}):{x[tt].value} is not empty"
+        return True
+
+    return elem.value in empty_values
+
+
+def test_empty_tags_are_emptied(orig_anon_dataset):
     orig_ds, anon_ds = orig_anon_dataset
 
-    for tt in empty_tags:
-        if tt in orig_ds:
-            assert anon_ds[tt].value in empty_values
+    for tt in empty_tags:  # sourcery skip: no-loop-in-tests
+        if (
+            tt in orig_ds and len(orig_ds[tt].value) > 0
+        ):  # sourcery skip: no-conditionals-in-tests
+            assert is_elem_empty(
+                anon_ds[tt]
+            ), f"({tt[0]:04X},{tt[1]:04x}):{anon_ds[tt].value} is not empty"