From e4a54297c63d0dc8f691645cef331d6388c2c5c0 Mon Sep 17 00:00:00 2001
From: Paul Choisel
Date: Mon, 15 Jan 2024 13:31:24 +0100
Subject: [PATCH] ENH: Add dicom fields scrapping script

---
 .gitignore                    |   3 +-
 pyproject.toml                |   3 +
 scripts/README.md             |  12 +++
 scripts/scrap_DICOM_fields.py | 152 ++++++++++++++++++++++++++++++++++
 4 files changed, 169 insertions(+), 1 deletion(-)
 create mode 100644 scripts/README.md
 create mode 100644 scripts/scrap_DICOM_fields.py

diff --git a/.gitignore b/.gitignore
index 529c1b9..5fe5836 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,5 @@ env
 __pycache__
 .vscode
 build
-*.egg-info
\ No newline at end of file
+*.egg-info
+.python-version
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 026195b..886926e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,9 @@ dependencies = [
 dev = [
     "pytest",
     "setuptools", # Needed to load pydicom's test files
+    "beautifulsoup4",
+    "fire",
+    "requests"
 ]
 
 [project.scripts]
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..1a11a6e
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,12 @@
+# Script folder
+
+This folder contains utility scripts for the maintenance of the package.
+
+## scrap_DICOM_fields.py
+
+This script downloads a web page and tries to scrape the DICOM fields and their anonymization command from it.
+
+1. Clone the repository: `git clone https://github.com/KitwareMedical/dicom-anonymizer.git`
+1. Go into the repository: `cd dicom-anonymizer`
+1. Install the dependencies: `pip install -e '.[dev]'`
+1. Run the script: `python scripts/scrap_DICOM_fields.py` (Run it with `-h` to get a list of arguments)
diff --git a/scripts/scrap_DICOM_fields.py b/scripts/scrap_DICOM_fields.py
new file mode 100644
index 0000000..66f742f
--- /dev/null
+++ b/scripts/scrap_DICOM_fields.py
@@ -0,0 +1,152 @@
+"""
+Download a web page and try to scrape the DICOM fields and their anonymization command from it.
+
+Written by Mohammad Khawar Zia
+"""
+
+from collections import defaultdict
+
+import fire
+import requests
+from bs4 import BeautifulSoup
+
+
+dicom_fields_header = """# Tags anonymized in DICOM standard
+# Documentation for groups meaning can be found in default associated actions.
+# https://dicom.nema.org/medical/dicom/current/output/chtml/part15/chapter_e.html
+
+"""
+
+dicom_fields_footer = """# Contains all previous tags into one array
+ALL_TAGS = []
+ALL_TAGS.extend(D_TAGS)
+ALL_TAGS.extend(Z_TAGS)
+ALL_TAGS.extend(X_TAGS)
+ALL_TAGS.extend(U_TAGS)
+ALL_TAGS.extend(Z_D_TAGS)
+ALL_TAGS.extend(X_Z_TAGS)
+ALL_TAGS.extend(X_D_TAGS)
+ALL_TAGS.extend(X_Z_D_TAGS)
+ALL_TAGS.extend(X_Z_U_STAR_TAGS)
+"""
+
+
+def scrap_profiles(url):
+    """Scrape table E.1-1 of a page and group its tags by "Basic Prof." value.
+
+    Args:
+        url: Address of the page containing the element with id ``table_E.1-1``.
+
+    Returns:
+        A ``defaultdict(list)`` mapping each "Basic Prof." value to a list of
+        ``'<tag>, # <attribute name>'`` strings.
+
+    Raises:
+        requests.RequestException: If the page cannot be downloaded.
+        ValueError: If the table cannot be found in the page.
+    """
+    # A timeout keeps the script from hanging forever on a stalled connection.
+    page = requests.get(url, timeout=30)
+    # Do not try to parse an HTTP error page.
+    page.raise_for_status()
+    soup = BeautifulSoup(page.content, "html.parser")
+
+    anchor = soup.find(attrs={'id': 'table_E.1-1'})
+    table = anchor.parent.find('table') if anchor is not None else None
+    if table is None:
+        raise ValueError(f"Could not find table 'table_E.1-1' in {url}")
+
+    headers = [th.text for th in table.find('thead').find_all('strong')]
+
+    data = []
+    for tr in table.find('tbody').find_all('tr'):
+        row = {key: cell.text.strip() for key, cell in zip(headers, tr.find_all('td'))}
+        entry = (row.get('Tag'), row.get('Attribute Name'), row.get('Basic Prof.'))
+        # A row missing one of the three columns cannot produce an entry and
+        # would make the sort below fail on None.
+        if None not in entry:
+            data.append(entry)
+
+    # Order by "Basic Prof." value, then by attribute name.
+    data.sort(key=lambda ele: (ele[2], ele[1]))
+
+    # Attributes that get a fixed entry instead of one derived from their tag.
+    fixed_tags = {
+        'Curve Data': '(0x5000, 0x0000, 0xFF00, 0x0000)',
+        'Overlay Comments': '(0x6000, 0x4000, 0xFF00, 0xFFFF)',
+        'Overlay Data': '(0x6000, 0x3000, 0xFF00, 0xFFFF)',
+    }
+    fields_to_skip = {
+        'Private Attributes',
+    }
+
+    profiles = defaultdict(list)
+    for tag, name, profile in data:
+        # Drop zero-width spaces and line breaks before comparing the name.
+        name = name.replace('\u200b', '').replace('\n', '')
+        if name in fields_to_skip:
+            continue
+
+        new_tag = fixed_tags.get(name)
+        if new_tag is None:
+            # '(GGGG,EEEE)' -> '(0xGGGG, 0xEEEE)'
+            new_tag = f'{tag[:1]}0x{tag[1:6]} 0x{tag[6:]}'
+
+        profiles[profile].append(f'{new_tag}, # {name}')
+
+    return profiles
+
+
+def create_DICOM_fields(profiles):
+    """Build the content of the generated DICOM fields module.
+
+    Args:
+        profiles: Mapping from "Basic Prof." value to its list of entry
+            strings, as returned by ``scrap_profiles``.
+
+    Returns:
+        The generated source: header, one list per profile, then footer.
+    """
+    dicom_fields = []
+    for tag, tag_list, comment in (
+        ('D', 'D_TAGS', '# Replaced tags'),
+        ('Z', 'Z_TAGS', "# Replaced with empty values (0, '', ...)"),
+        ('X', 'X_TAGS', '# Deleted tags'),
+        ('U', 'U_TAGS', '# Replace UID'),
+
+        ('Z/D', 'Z_D_TAGS', '# Replace element according to the VR'),
+        ('X/Z', 'X_Z_TAGS', '# Set the value to empty according to the VR'),
+        ('X/D', 'X_D_TAGS', "# Replace element according to the VR"),
+
+        ('X/Z/D', 'X_Z_D_TAGS', '# Replace element according to the VR'),
+        ('X/Z/U*', 'X_Z_U_STAR_TAGS',
+         '# Replace element with UI as VR, else replace according to VR with empty values'),
+    ):
+        dicom_fields.append(f'{comment}\n{tag_list} = [\n')
+        # A profile absent from the page still gets an (empty) list, because
+        # the footer references every list by name.
+        dicom_fields.extend(f' {profile}\n' for profile in profiles.get(tag, ()))
+        dicom_fields.append(']\n\n')
+
+    return dicom_fields_header + ''.join(dicom_fields) + dicom_fields_footer
+
+
+def main(
+        url="https://dicom.nema.org/medical/dicom/current/output/chtml/part15/chapter_e.html",
+        output_path='dicomanonymizer/dicomfields.py'):
+    """Scrape the DICOM fields from ``url`` and write them to ``output_path``.
+
+    Args:
+        url: Page to download the table from.
+        output_path: File the generated source is written to.
+    """
+    profiles = scrap_profiles(url)
+    file_content = create_DICOM_fields(profiles=profiles)
+    # The attribute names come from a web page: always write UTF-8 rather than
+    # the platform default encoding.
+    with open(output_path, 'w', encoding='utf-8') as file:
+        file.write(file_content)
+
+
+if __name__ == '__main__':
+    fire.Fire(main)