From 8e64becf0c8af0020d974b710945101d7cb83671 Mon Sep 17 00:00:00 2001
From: Vamsi Thiriveedhi
Date: Sat, 4 May 2024 20:51:35 +0000
Subject: [PATCH 1/2] feat: provide a command line option to download from a
 selection or a manifest using click module. Uses 'idc' as 'binary' but can
 be configured to anything the two options download from selection and
 download from manifest are direct mirrors of index.py --log-level option is
 introduced to cli, to control the log level of index.py

---
Changes relative to the originally submitted patch:
 - set_log_level() also applies the level to the CLI logger, so the debug
   messages of the commands can actually be emitted.
 - the repeated comma-splitting code is factored into _split_option_values(),
   which drops empty items and rejects values that contain no ID at all.
 - --manifest-file must point to an existing file.
 - the debug dumps include download_dir (selection) and quiet (manifest).
 - the CLI tests pass string arguments and check the exit code.
 - the abbreviated blob ids on the "index" lines were not recomputed.

 idc_index/cli.py   | 279 +++++++++++++++++++++++++++++++++++++++++++++
 idc_index/index.py |   6 +-
 pyproject.toml     |   4 +
 tests/idcindex.py  |  51 ++++++++-
 4 files changed, 335 insertions(+), 5 deletions(-)
 create mode 100644 idc_index/cli.py

diff --git a/idc_index/cli.py b/idc_index/cli.py
new file mode 100644
index 00000000..7c94797d
--- /dev/null
+++ b/idc_index/cli.py
@@ -0,0 +1,279 @@
+"""CLI module for the IDC client.
+
+This module provides command-line interface (CLI) commands to interact with the Imaging Data Commons (IDC) data.
+"""
+from __future__ import annotations
+
+import logging
+
+import click
+
+from . import index
+from .index import IDCClient
+
+# Set up logging for the CLI module
+logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG)
+logger_cli = logging.getLogger("cli")
+logger_cli.setLevel("WARNING")
+
+
+@click.group()
+def main():
+    """Idc is a command line client to help download data from Imaging Data Commons."""
+
+
+def set_log_level(log_level):
+    """Set the logging level of the CLI logger and of index.py.
+
+    Args:
+        log_level (str): Case-insensitive level name; unknown names fall back
+            to WARNING.
+    """
+    log_levels = {
+        "debug": logging.DEBUG,
+        "info": logging.INFO,
+        "warning": logging.WARNING,
+        "error": logging.ERROR,
+        "critical": logging.CRITICAL,
+    }
+    logging_level = log_levels.get(log_level.lower(), logging.WARNING)
+    # Apply the level to the CLI logger too; it starts at WARNING, so without
+    # this none of the logger_cli.debug() messages could ever be emitted.
+    logger_cli.setLevel(logging_level)
+    logger_cli.debug(f"Setting the log level of index.py to {logging_level}")
+    index.logger.setLevel(logging_level)
+
+
+def _split_option_values(values, option_name):
+    """Flatten repeated and/or comma-separated option values into one list.
+
+    Args:
+        values (tuple[str, ...]): Values click collected for an option declared
+            with ``multiple=True``; each value may hold comma-separated items.
+        option_name (str): Option name used in the error message.
+
+    Returns:
+        list[str] | None: Stripped, non-empty items, or None when the option
+        was not given, so that no filter is applied.
+
+    Raises:
+        click.BadParameter: If the option was given but contains only empty items.
+    """
+    if not values:
+        return None
+    items = [item.strip() for value in values for item in value.split(",")]
+    items = [item for item in items if item]
+    if not items:
+        # Do not silently turn an empty value into "no filter": that would
+        # select every file for download.
+        raise click.BadParameter(
+            "expected at least one non-empty value", param_hint=option_name
+        )
+    return items
+
+
+@main.command()
+@click.option(
+    "--download-dir",
+    required=True,
+    type=click.Path(),
+    help="Path to the directory to download the files to.",
+)
+@click.option(
+    "--dry-run",
+    type=bool,
+    default=False,
+    help="If set, calculates the size of the cohort but download does not start.",
+)
+@click.option(
+    "--collection-id",
+    type=str,
+    multiple=True,
+    default=None,
+    help="Collection ID(s) to filter by.",
+)
+@click.option(
+    "--patient-id",
+    type=str,
+    multiple=True,
+    default=None,
+    help="Patient ID(s) to filter by.",
+)
+@click.option(
+    "--study-instance-uid",
+    type=str,
+    multiple=True,
+    default=None,
+    help="DICOM StudyInstanceUID(s) to filter by.",
+)
+@click.option(
+    "--series-instance-uid",
+    type=str,
+    multiple=True,
+    default=None,
+    help="DICOM SeriesInstanceUID(s) to filter by.",
+)
+@click.option(
+    "--quiet",
+    type=bool,
+    default=True,
+    help="If set, suppresses the output of the subprocess.",
+)
+@click.option(
+    "--show-progress-bar",
+    type=bool,
+    default=True,
+    help="If set, tracks the progress of download.",
+)
+@click.option(
+    "--use-s5cmd-sync",
+    type=bool,
+    default=False,
+    help="If set, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded.",
+)
+@click.option(
+    "--log-level",
+    type=click.Choice(
+        ["debug", "info", "warning", "error", "critical"], case_sensitive=False
+    ),
+    default="info",
+    help="Set the logging level for the CLI module.",
+)
+def download_from_selection(
+    download_dir,
+    dry_run,
+    collection_id,
+    patient_id,
+    study_instance_uid,
+    series_instance_uid,
+    quiet,
+    show_progress_bar,
+    use_s5cmd_sync,
+    log_level,
+):
+    """Download from a selection of collection(s), patient(s), study(studies) and series.
+
+    The filtering will be applied in sequence by first selecting the collection(s), followed by
+    patient(s), study(studies) and series. If no filtering is applied, all the files will be downloaded.
+    """
+    # Set the logging level for the CLI module
+    set_log_level(log_level)
+    # Parse the input parameters before the client is created so that invalid values fail fast
+    collection_id = _split_option_values(collection_id, "--collection-id")
+    patient_id = _split_option_values(patient_id, "--patient-id")
+    study_instance_uid = _split_option_values(
+        study_instance_uid, "--study-instance-uid"
+    )
+    series_instance_uid = _split_option_values(
+        series_instance_uid, "--series-instance-uid"
+    )
+    # Create an instance of the IDCClient
+    client = IDCClient()
+    logger_cli.debug("Inputs received from cli download:")
+    logger_cli.debug(f"download_dir: {download_dir}")
+    logger_cli.debug(f"collection_id: {collection_id}")
+    logger_cli.debug(f"patient_id: {patient_id}")
+    logger_cli.debug(f"study_instance_uid: {study_instance_uid}")
+    logger_cli.debug(f"series_instance_uid: {series_instance_uid}")
+    logger_cli.debug(f"dry_run: {dry_run}")
+    logger_cli.debug(f"quiet: {quiet}")
+    logger_cli.debug(f"show_progress_bar: {show_progress_bar}")
+    logger_cli.debug(f"use_s5cmd_sync: {use_s5cmd_sync}")
+
+    client.download_from_selection(
+        download_dir,
+        dry_run=dry_run,
+        collection_id=collection_id,
+        patientId=patient_id,
+        studyInstanceUID=study_instance_uid,
+        seriesInstanceUID=series_instance_uid,
+        quiet=quiet,
+        show_progress_bar=show_progress_bar,
+        use_s5cmd_sync=use_s5cmd_sync,
+    )
+
+
+@main.command()
+@click.option(
+    "--manifest-file",
+    required=True,
+    type=click.Path(exists=True, dir_okay=False),
+    help="The path to the manifest file.",
+)
+@click.option(
+    "--download-dir",
+    required=True,
+    type=click.Path(),
+    help="Path to the directory to download the files to.",
+)
+@click.option(
+    "--quiet",
+    type=bool,
+    default=True,
+    help="If set, suppresses the output of the subprocess.",
+)
+@click.option(
+    "--validate-manifest",
+    type=bool,
+    default=True,
+    help="If True, validates the manifest for any errors. Defaults to True.",
+)
+@click.option(
+    "--show-progress-bar",
+    type=bool,
+    default=True,
+    help="If set, tracks the progress of download.",
+)
+@click.option(
+    "--use-s5cmd-sync",
+    type=bool,
+    default=False,
+    help="If set, will use s5cmd sync operation instead of cp when downloadDirectory is not empty; this can significantly improve the download speed if the content is partially downloaded.",
+)
+@click.option(
+    "--log-level",
+    type=click.Choice(
+        ["debug", "info", "warning", "error", "critical"], case_sensitive=False
+    ),
+    default="info",
+    help="Set the logging level for the CLI module.",
+)
+def download_from_manifest(
+    manifest_file,
+    download_dir,
+    quiet,
+    validate_manifest,
+    show_progress_bar,
+    use_s5cmd_sync,
+    log_level,
+):
+    """Download the manifest file.
+
+    In a series of steps, the manifest file is first validated to ensure every line contains a valid URL.
+    It then gets the total size to be downloaded and runs the download process on one
+    process and download progress on another process.
+    """
+    # Set the logging level for the CLI module
+    set_log_level(log_level)
+    # Create an instance of the IDCClient
+    client = IDCClient()
+    logger_cli.debug("Inputs received from cli manifest download:")
+    logger_cli.debug(f"manifest_file_path: {manifest_file}")
+    logger_cli.debug(f"download_dir: {download_dir}")
+    logger_cli.debug(f"quiet: {quiet}")
+    logger_cli.debug(f"validate_manifest: {validate_manifest}")
+    logger_cli.debug(f"show_progress_bar: {show_progress_bar}")
+    logger_cli.debug(f"use_s5cmd_sync: {use_s5cmd_sync}")
+    # Call IDCClient's download_from_manifest method with the provided parameters
+    client.download_from_manifest(
+        manifestFile=manifest_file,
+        downloadDir=download_dir,
+        quiet=quiet,
+        validate_manifest=validate_manifest,
+        show_progress_bar=show_progress_bar,
+        use_s5cmd_sync=use_s5cmd_sync,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/idc_index/index.py b/idc_index/index.py
index 3684fcd1..297b7d93 100644
--- a/idc_index/index.py
+++ b/idc_index/index.py
@@ -19,12 +19,12 @@
 from packaging.version import Version
 from tqdm import tqdm
 
-logger = logging.getLogger(__name__)
-logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
-
 aws_endpoint_url = "https://s3.amazonaws.com"
 gcp_endpoint_url = "https://storage.googleapis.com"
 
+logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
 
 class IDCClient:
     # Default download hierarchy template
diff --git a/pyproject.toml b/pyproject.toml
index 2a8faaa3..fe214e9f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,7 @@ classifiers = [
 ]
 dynamic = ["version"]
 dependencies = [
+  "click",
   'duckdb>=0.10.0',
   "idc-index-data==18.0.1",
   "packaging",
@@ -60,6 +61,9 @@ docs = [
   "furo>=2023.08.17",
 ]
 
+[project.scripts]
+idc = 'idc_index.cli:main'
+
 [project.urls]
 Homepage = "https://github.com/ImagingDataCommons/idc-index"
 "Bug Tracker" = "https://github.com/ImagingDataCommons/idc-index/issues"
diff --git a/tests/idcindex.py b/tests/idcindex.py
index 7f33f5f6..4c4df253 100644
--- a/tests/idcindex.py
+++ b/tests/idcindex.py
@@ -8,12 +8,13 @@
 
 import pandas as pd
 import pytest
-from idc_index import index
+from click.testing import CliRunner
+from idc_index import cli, index
 
 # Run tests using the following command from the root of the repository:
 # python -m unittest -vv tests/idcindex.py
 
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.DEBUG)
 
 
 @pytest.fixture(autouse=True)
@@ -24,6 +25,8 @@ def _change_test_dir(request, monkeypatch):
 class TestIDCClient(unittest.TestCase):
     def setUp(self):
         self.client = index.IDCClient()
+        self.download_from_manifest = cli.download_from_manifest
+        self.download_from_selection = cli.download_from_selection
 
         logger = logging.getLogger("idc_index")
         logger.setLevel(logging.DEBUG)
 
@@ -370,6 +373,50 @@ def test_citations(self):
         )
         self.assertIsNotNone(citations)
 
+    def test_cli_download_from_selection(self):
+        runner = CliRunner()
+        with tempfile.TemporaryDirectory() as temp_dir:
+            result = runner.invoke(
+                self.download_from_selection,
+                [
+                    "--download-dir",
+                    temp_dir,
+                    "--dry-run",
+                    "False",
+                    "--quiet",
+                    "True",
+                    "--show-progress-bar",
+                    "True",
+                    "--use-s5cmd-sync",
+                    "False",
+                    "--study-instance-uid",
+                    "1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462",
+                ],
+            )
+            assert result.exit_code == 0, result.output
+            assert len(os.listdir(temp_dir)) != 0
+
+    def test_cli_download_from_manifest(self):
+        runner = CliRunner()
+        with tempfile.TemporaryDirectory() as temp_dir:
+            result = runner.invoke(
+                self.download_from_manifest,
+                [
+                    "--manifest-file",
+                    "./study_manifest_aws.s5cmd",
+                    "--download-dir",
+                    temp_dir,
+                    "--quiet",
+                    "True",
+                    "--show-progress-bar",
+                    "True",
+                    "--use-s5cmd-sync",
+                    "False",
+                ],
+            )
+            assert result.exit_code == 0, result.output
+            assert len(os.listdir(temp_dir)) != 0
+
 
 if __name__ == "__main__":
     unittest.main()

From 76ad79bc08a8c15285256c55e114beca10dcb440 Mon Sep 17 00:00:00 2001
From: Andrey Fedorov
Date: Wed, 29 May 2024 03:54:05 -0400
Subject: [PATCH 2/2] enh: change logging back to INFO

---
 idc_index/index.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/idc_index/index.py b/idc_index/index.py
index 297b7d93..0edf02f8 100644
--- a/idc_index/index.py
+++ b/idc_index/index.py
@@ -22,7 +22,7 @@
 aws_endpoint_url = "https://s3.amazonaws.com"
 gcp_endpoint_url = "https://storage.googleapis.com"
 
-logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG)
+logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 