From e4a54297c63d0dc8f691645cef331d6388c2c5c0 Mon Sep 17 00:00:00 2001
From: Paul Choisel <paul.choisel@kitware.com>
Date: Mon, 15 Jan 2024 13:31:24 +0100
Subject: [PATCH] ENH: Add dicom fields scrapping script

---
 .gitignore                    |   3 +-
 pyproject.toml                |   3 +
 scripts/README.md             |  12 ++++
 scripts/scrap_DICOM_fields.py | 111 ++++++++++++++++++++++++++++++++++
 4 files changed, 128 insertions(+), 1 deletion(-)
 create mode 100644 scripts/README.md
 create mode 100644 scripts/scrap_DICOM_fields.py

diff --git a/.gitignore b/.gitignore
index 529c1b9..5fe5836 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,5 @@ env
 __pycache__
 .vscode
 build
-*.egg-info
\ No newline at end of file
+*.egg-info
+.python-version
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 026195b..886926e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,9 @@ dependencies = [
 dev = [
   "pytest",
   "setuptools", # Needed to load pydicom's test files
+  "bs4",
+  "fire",
+  "requests"
 ]
 
 [project.scripts]
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..1a11a6e
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,12 @@
+# Script folder
+
+This folder contains utility scripts for the maintenance of the package.
+
+## scrap_DICOM_fields.py
+
+This script downloads a web page and tries to scrape the DICOM fields and their anonymization actions from it.
+
+1. Clone the repository: `git clone https://github.com/KitwareMedical/dicom-anonymizer.git`
+1. Enter the repository: `cd dicom-anonymizer`
+1. Install the dependencies: `pip install -e '.[dev]'`
+1. Run the script: `python scripts/scrap_DICOM_fields.py` (Run it with `-h` to get a list of arguments)
\ No newline at end of file
diff --git a/scripts/scrap_DICOM_fields.py b/scripts/scrap_DICOM_fields.py
new file mode 100644
index 0000000..66f742f
--- /dev/null
+++ b/scripts/scrap_DICOM_fields.py
@@ -0,0 +1,111 @@
+"""
+Download a web page and try to scrape the DICOM fields and their anonymization actions from it.
+
+Written by Mohammad Khawar Zia
+"""
+
+import fire
+import requests
+
+from collections import defaultdict
+from bs4 import BeautifulSoup
+
+# Comment banner that create_DICOM_fields() puts at the top of the generated file.
+dicom_fields_header = """# Tags anonymized in DICOM standard
+# Documentation for groups meaning can be found in default associated actions.
+# https://dicom.nema.org/medical/dicom/current/output/chtml/part15/chapter_e.html
+
+"""
+# Appended after the generated lists; it references the list names emitted by create_DICOM_fields().
+dicom_fields_footer = """# Contains all previous tags into one array
+ALL_TAGS = []
+ALL_TAGS.extend(D_TAGS)
+ALL_TAGS.extend(Z_TAGS)
+ALL_TAGS.extend(X_TAGS)
+ALL_TAGS.extend(U_TAGS)
+ALL_TAGS.extend(Z_D_TAGS)
+ALL_TAGS.extend(X_Z_TAGS)
+ALL_TAGS.extend(X_D_TAGS)
+ALL_TAGS.extend(X_Z_D_TAGS)
+ALL_TAGS.extend(X_Z_U_STAR_TAGS)
+"""
+
+
+def scrap_profiles(url):
+    """Scrape table E.1-1 from ``url`` and group tag lines by 'Basic Prof.' code.
+
+    Returns a ``defaultdict(list)`` mapping each 'Basic Prof.' value found in the
+    table to lines of the form ``(0xGGGG, 0xEEEE), # Attribute Name``.
+    Raises ``requests.HTTPError`` on a bad status, ``ValueError`` if the table is missing.
+    """
+    # Without an explicit timeout, requests can wait forever on a stalled server.
+    page = requests.get(url, timeout=30)
+    page.raise_for_status()
+    soup = BeautifulSoup(page.content, "html.parser")
+
+    anchor = soup.find(attrs={'id': 'table_E.1-1'})
+    table = anchor.parent.find('table') if anchor is not None else None
+    if table is None:
+        raise ValueError(f"Table 'table_E.1-1' not found in {url}")
+    headers = [th.text for th in table.find('thead').find_all('strong')]
+
+    data = []
+    for tr in table.find('tbody').find_all('tr'):
+        row = {key: td.text.strip() for key, td in zip(headers, tr.find_all('td'))}
+        data.append((row.get('Tag'), row.get('Attribute Name'), row.get('Basic Prof.')))
+    # Order by profile code first, then by attribute name.
+    data.sort(key=lambda ele: (ele[2], ele[1]))
+
+    # These attributes get hard-coded four-value tuples instead of a tag
+    # derived from the table text.
+    special_tags = {
+        'Curve Data': '(0x5000, 0x0000, 0xFF00, 0x0000)',
+        'Overlay Comments': '(0x6000, 0x4000, 0xFF00, 0xFFFF)',
+        'Overlay Data': '(0x6000, 0x3000, 0xFF00, 0xFFFF)',
+    }
+    fields_to_skip = {'Private Attributes'}
+    profiles = defaultdict(list)
+    for tag, name, profile in data:
+        if name in fields_to_skip:
+            continue
+        # '(GGGG,EEEE)' -> '(0xGGGG, 0xEEEE)'
+        new_tag = special_tags.get(name) or f'{tag[:1]}0x{tag[1:6]} 0x{tag[6:]}'
+        name = name.replace('\u200b', '').replace('\n', '')
+        profiles[profile].append(f'{new_tag}, # {name}')
+    return profiles
+
+
+def create_DICOM_fields(profiles):
+    """Return the generated module text: header, one list per action code, footer.
+
+    A code absent from ``profiles`` yields an empty list instead of a TypeError.
+    """
+    sections = (
+        ('D', 'D_TAGS', '# Replaced tags'),
+        ('Z', 'Z_TAGS', "# Replaced with empty values (0, '', ...)"),
+        ('X', 'X_TAGS', '# Deleted tags'),
+        ('U', 'U_TAGS', '# Replace UID'),
+        ('Z/D', 'Z_D_TAGS', '# Replace element according to the VR'),
+        ('X/Z', 'X_Z_TAGS', '# Set the value to empty according to the VR'),
+        ('X/D', 'X_D_TAGS', "# Replace element according to the VR"),
+        ('X/Z/D', 'X_Z_D_TAGS', '# Replace element according to the VR'),
+        ('X/Z/U*', 'X_Z_U_STAR_TAGS',
+         '# Replace element with UI as VR, else replace according to VR with empty values'),
+    )
+    parts = [dicom_fields_header]
+    for tag, tag_list, comment in sections:
+        entries = ''.join(f'    {line}\n' for line in profiles.get(tag, ()))
+        parts.append(f'{comment}\n{tag_list} = [\n{entries}]\n\n')
+    return ''.join(parts + [dicom_fields_footer])
+
+
+def main(
+        url="https://dicom.nema.org/medical/dicom/current/output/chtml/part15/chapter_e.html",
+        output_path='dicomanonymizer/dicomfields.py'):
+    """Scrape the DICOM fields table at ``url`` and write the generated module to ``output_path``."""
+    file_content = create_DICOM_fields(profiles=scrap_profiles(url))
+    with open(output_path, 'w', encoding='utf-8') as file:  # explicit UTF-8, not the platform default
+        file.write(file_content)
+
+if __name__ == "__main__":
+    fire.Fire(main)
\ No newline at end of file