Merge pull request #96 from ImagingDataCommons/add-smart-download

enh: add zero parameter download cli
ImagingDataCommons · Jul 4, 2024 · fa56355 · fa56355
2 parents 965730b + 3338aa9
commit fa56355
Show file tree

Hide file tree

Showing 3 changed files with 102 additions and 11 deletions.
diff --git a/idc_index/cli.py b/idc_index/cli.py
@@ -5,6 +5,7 @@
 from __future__ import annotations
 
 import logging
+from pathlib import Path
 
 import click
 
@@ -14,7 +15,7 @@
 # Set up logging for the CLI module
 logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG)
 logger_cli = logging.getLogger("cli")
-logger_cli.setLevel("WARNING")
+logger_cli.setLevel(logging.INFO)
 
 
 @click.group()
@@ -38,6 +39,7 @@ def set_log_level(log_level):
     logging_level = log_levels.get(log_level.lower(), logging.WARNING)
     logger_cli.debug(f"Setting the log level of index.py to {logging_level}")
     index.logger.setLevel(logging_level)
+    logger_cli.setLevel(logging_level)
 
 
 @main.command()
@@ -128,6 +130,7 @@ def download_from_selection(
     set_log_level(log_level)
     # Create an instance of the IDCClient
     client = IDCClient()
+    logger_cli.info(f"Downloading from IDC {client.get_idc_version()} index")
     # Parse the input parameters and pass them to IDCClient's download_from_selection method
     collection_id = (
         [cid.strip() for cid in (",".join(collection_id)).split(",")]
@@ -236,6 +239,7 @@ def download_from_manifest(
     set_log_level(log_level)
     # Create an instance of the IDCClient
     client = IDCClient()
+    logger_cli.info(f"Downloading from IDC {client.get_idc_version()} index")
     logger_cli.debug("Inputs received from cli manifest download:")
     logger_cli.debug(f"manifest_file_path: {manifest_file}")
     logger_cli.debug(f"download_dir: {download_dir}")
@@ -253,5 +257,81 @@ def download_from_manifest(
     )
 
 
+@main.command()
+@click.argument(
+    "generic_argument",
+    type=str,
+)
+@click.option(
+    "--log-level",
+    type=click.Choice(
+        ["debug", "info", "warning", "error", "critical"], case_sensitive=False
+    ),
+    default="info",
+    help="Set the logging level for the CLI module.",
+)
+def download(generic_argument, log_level):
+    """Download content given the input parameter.
+
+    Determine whether the input parameter corresponds to a file manifest or a list of collection_id, PatientID, StudyInstanceUID, or SeriesInstanceUID values, and download the corresponding files into the current directory. Default parameters will be used for organizing the downloaded files into folder hierarchy. Use `download_from_selection()` and `download_from_manifest()` functions if granular control over the download process is needed.
+    """
+    # Set the logging level for the CLI module
+    set_log_level(log_level)
+    # Create an instance of the IDCClient
+    client = IDCClient()
+
+    logger_cli.info(f"Downloading from IDC {client.get_idc_version()} index")
+
+    download_dir = Path.cwd()
+
+    if Path(generic_argument).is_file():
+        # Parse the input parameters and pass them to IDC
+        logger_cli.info("Detected manifest file, downloading from manifest.")
+        client.download_from_manifest(generic_argument, downloadDir=download_dir)
+    # this is not a file manifest
+    else:
+        # Split the input string and filter out any empty values
+        item_ids = [item for item in generic_argument.split(",") if item]
+
+        if not item_ids:
+            logger_cli.error("No valid IDs provided.")
+
+        index_df = client.index
+
+        def check_and_download(column_name, item_ids, download_dir, kwarg_name):
+            matches = index_df[column_name].isin(item_ids)
+            matched_ids = index_df[column_name][matches].unique().tolist()
+            if not matched_ids:
+                return False
+            unmatched_ids = list(set(item_ids) - set(matched_ids))
+            if unmatched_ids:
+                logger_cli.debug(
+                    f"Partial match for {column_name}: matched {matched_ids}, unmatched {unmatched_ids}"
+                )
+            logger_cli.info(f"Identified matching {column_name}: {matched_ids}")
+            client.download_from_selection(
+                **{kwarg_name: matched_ids, "downloadDir": download_dir}
+            )
+            return True
+
+        matches_found = 0
+        matches_found += check_and_download(
+            "collection_id", item_ids, download_dir, "collection_id"
+        )
+        matches_found += check_and_download(
+            "PatientID", item_ids, download_dir, "patientId"
+        )
+        matches_found += check_and_download(
+            "StudyInstanceUID", item_ids, download_dir, "studyInstanceUID"
+        )
+        matches_found += check_and_download(
+            "SeriesInstanceUID", item_ids, download_dir, "seriesInstanceUID"
+        )
+        if not matches_found:
+            logger_cli.error(
+                "None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID."
+            )
+
+
 if __name__ == "__main__":
     main()
diff --git a/idc_index/index.py b/idc_index/index.py
@@ -781,10 +781,10 @@ def _track_download_progress(
         show_progress_bar: bool = True,
         list_of_directories=None,
     ):
-        logger.info("Inputs received for tracking download:")
-        logger.info(f"size_MB: {size_MB}")
-        logger.info(f"downloadDir: {downloadDir}")
-        logger.info(f"show_progress_bar: {show_progress_bar}")
+        logger.debug("Inputs received for tracking download:")
+        logger.debug(f"size_MB: {size_MB}")
+        logger.debug(f"downloadDir: {downloadDir}")
+        logger.debug(f"show_progress_bar: {show_progress_bar}")
 
         runtime_errors = []
 
@@ -798,7 +798,7 @@ def _track_download_progress(
 
             logger.info("Initial size of the directory: %s bytes", initial_size_bytes)
             logger.info(
-                "Approx. Size of the files need to be downloaded: %s bytes",
+                "Approximate size of the files that need to be downloaded: %s bytes",
                 total_size_bytes,
             )
 
@@ -902,7 +902,7 @@ def _parse_s5cmd_sync_output_and_generate_synced_manifest(
         sync_size = merged_df["series_size_MB"].sum()
         sync_size_rounded = round(sync_size, 2)
 
-        logger.info(f"sync_size_rounded: {sync_size_rounded}")
+        logger.debug(f"sync_size_rounded: {sync_size_rounded}")
 
         if dirTemplate is not None:
             hierarchy = self._generate_sql_concat_for_building_directory(
@@ -1050,11 +1050,11 @@ def _s5cmd_run(
                     if sync_size < total_size:
                         logger.info(
                             """
-Destination folder is not empty and sync size is less than total size. Displaying a warning
+Destination folder is not empty and sync size is less than total size.
 """
                         )
                         existing_data_size = round(total_size - sync_size, 2)
-                        logger.warning(
+                        logger.info(
                             f"Requested total download size is {total_size} MB, \
                                     however at least {existing_data_size} MB is already present,\
                                     so downloading only remaining upto {sync_size} MB\n\
@@ -1075,7 +1075,7 @@ def _s5cmd_run(
                 )
         else:
             logger.info(
-                "Not using s5cmd sync dry run as the destination folder is empty or sync dry or progress bar is not requested"
+                "Not using s5cmd sync as the destination folder is empty or sync or progress bar is not requested"
             )
             cmd = [
                 self.s5cmdPath,
@@ -1430,7 +1430,7 @@ def download_from_selection(
                 list_of_directories = result_df.path.to_list()
             else:
                 list_of_directories = [downloadDir]
-        logger.info(
+        logger.debug(
             """
 Temporary download manifest is generated and is passed to self._s5cmd_run
 """

diff --git a/tests/idcindex.py b/tests/idcindex.py
@@ -5,6 +5,7 @@
 import tempfile
 import unittest
 from itertools import product
+from pathlib import Path
 
 import pandas as pd
 import pytest
@@ -27,6 +28,7 @@ def setUp(self):
         self.client = index.IDCClient()
         self.download_from_manifest = cli.download_from_manifest
         self.download_from_selection = cli.download_from_selection
+        self.download = cli.download
 
         logger = logging.getLogger("idc_index")
         logger.setLevel(logging.DEBUG)
@@ -421,6 +423,15 @@ def test_cli_download_from_manifest(self):
             )
             assert len(os.listdir(temp_dir)) != 0
 
+    def test_cli_download(self):
+        runner = CliRunner()
+        with runner.isolated_filesystem():
+            result = runner.invoke(
+                self.download,
+                ["1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462"],
+            )
+            assert len(os.listdir(Path.cwd())) != 0
+
 
 if __name__ == "__main__":
     unittest.main()