Skip to content

Commit

Permalink
Merge pull request #96 from ImagingDataCommons/add-smart-download
Browse files (browse the repository at this point in the history)
enh: add zero parameter download cli
  • Loading branch information
fedorov authored Jul 4, 2024
2 parents 965730b + 3338aa9 commit fa56355
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 11 deletions.
82 changes: 81 additions & 1 deletion idc_index/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from __future__ import annotations

import logging
from pathlib import Path

import click

Expand All @@ -14,7 +15,7 @@
# Set up logging for the CLI module
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG)
logger_cli = logging.getLogger("cli")
logger_cli.setLevel("WARNING")
logger_cli.setLevel(logging.INFO)


@click.group()
Expand All @@ -38,6 +39,7 @@ def set_log_level(log_level):
logging_level = log_levels.get(log_level.lower(), logging.WARNING)
logger_cli.debug(f"Setting the log level of index.py to {logging_level}")
index.logger.setLevel(logging_level)
logger_cli.setLevel(logging_level)


@main.command()
Expand Down Expand Up @@ -128,6 +130,7 @@ def download_from_selection(
set_log_level(log_level)
# Create an instance of the IDCClient
client = IDCClient()
logger_cli.info(f"Downloading from IDC {client.get_idc_version()} index")
# Parse the input parameters and pass them to IDCClient's download_from_selection method
collection_id = (
[cid.strip() for cid in (",".join(collection_id)).split(",")]
Expand Down Expand Up @@ -236,6 +239,7 @@ def download_from_manifest(
set_log_level(log_level)
# Create an instance of the IDCClient
client = IDCClient()
logger_cli.info(f"Downloading from IDC {client.get_idc_version()} index")
logger_cli.debug("Inputs received from cli manifest download:")
logger_cli.debug(f"manifest_file_path: {manifest_file}")
logger_cli.debug(f"download_dir: {download_dir}")
Expand All @@ -253,5 +257,81 @@ def download_from_manifest(
)


@main.command()
@click.argument(
    "generic_argument",
    type=str,
)
@click.option(
    "--log-level",
    type=click.Choice(
        ["debug", "info", "warning", "error", "critical"], case_sensitive=False
    ),
    default="info",
    help="Set the logging level for the CLI module.",
)
def download(generic_argument, log_level):
    """Download content given the input parameter.
    Determine whether the input parameter corresponds to a file manifest or a list of collection_id, PatientID, StudyInstanceUID, or SeriesInstanceUID values, and download the corresponding files into the current directory. Default parameters will be used for organizing the downloaded files into folder hierarchy. Use `download_from_selection()` and `download_from_manifest()` functions if granular control over the download process is needed.
    """
    # NOTE: the docstring above is rendered by click as the `--help` text, so
    # implementation notes live in comments instead.
    #
    # generic_argument: either a path to an existing manifest file, or a
    #     comma-separated list of identifiers.
    # log_level: one of the click.Choice values declared on the option.

    # Set the logging level for the CLI module
    set_log_level(log_level)
    # Create an instance of the IDCClient
    client = IDCClient()

    logger_cli.info(f"Downloading from IDC {client.get_idc_version()} index")

    download_dir = Path.cwd()

    # An existing file on disk is treated as a manifest; anything else is
    # interpreted as a list of identifiers.
    if Path(generic_argument).is_file():
        logger_cli.info("Detected manifest file, downloading from manifest.")
        client.download_from_manifest(generic_argument, downloadDir=download_dir)
        return

    # Split the input string, trim surrounding whitespace (consistent with
    # download_from_selection) and drop empty entries such as those produced
    # by "a,,b" or a trailing comma.
    item_ids = [
        stripped
        for stripped in (item.strip() for item in generic_argument.split(","))
        if stripped
    ]

    if not item_ids:
        logger_cli.error("No valid IDs provided.")
        # Nothing to look up: stop here instead of scanning the index and
        # emitting a second, misleading "no match" error.
        return

    index_df = client.index

    def check_and_download(column_name, kwarg_name):
        """Download every requested ID found in `column_name`; return True on any match."""
        matches = index_df[column_name].isin(item_ids)
        matched_ids = index_df[column_name][matches].unique().tolist()
        if not matched_ids:
            return False
        # Sorted so the debug message is deterministic across runs.
        unmatched_ids = sorted(set(item_ids) - set(matched_ids))
        if unmatched_ids:
            logger_cli.debug(
                f"Partial match for {column_name}: matched {matched_ids}, unmatched {unmatched_ids}"
            )
        logger_cli.info(f"Identified matching {column_name}: {matched_ids}")
        client.download_from_selection(
            **{kwarg_name: matched_ids, "downloadDir": download_dir}
        )
        return True

    # (index column, download_from_selection keyword) pairs, tried from the
    # coarsest to the finest level. Every level is always checked because the
    # input may mix identifiers of different kinds.
    id_levels = (
        ("collection_id", "collection_id"),
        ("PatientID", "patientId"),
        ("StudyInstanceUID", "studyInstanceUID"),
        ("SeriesInstanceUID", "seriesInstanceUID"),
    )
    matches_found = False
    for column_name, kwarg_name in id_levels:
        if check_and_download(column_name, kwarg_name):
            matches_found = True

    if not matches_found:
        logger_cli.error(
            "None of the values passed matched any of the identifiers: collection_id, PatientID, StudyInstanceUID, SeriesInstanceUID."
        )


# Invoke the click command group `main` when this module is run as a script.
if __name__ == "__main__":
main()
20 changes: 10 additions & 10 deletions idc_index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -781,10 +781,10 @@ def _track_download_progress(
show_progress_bar: bool = True,
list_of_directories=None,
):
logger.info("Inputs received for tracking download:")
logger.info(f"size_MB: {size_MB}")
logger.info(f"downloadDir: {downloadDir}")
logger.info(f"show_progress_bar: {show_progress_bar}")
logger.debug("Inputs received for tracking download:")
logger.debug(f"size_MB: {size_MB}")
logger.debug(f"downloadDir: {downloadDir}")
logger.debug(f"show_progress_bar: {show_progress_bar}")

runtime_errors = []

Expand All @@ -798,7 +798,7 @@ def _track_download_progress(

logger.info("Initial size of the directory: %s bytes", initial_size_bytes)
logger.info(
"Approx. Size of the files need to be downloaded: %s bytes",
"Approximate size of the files that need to be downloaded: %s bytes",
total_size_bytes,
)

Expand Down Expand Up @@ -902,7 +902,7 @@ def _parse_s5cmd_sync_output_and_generate_synced_manifest(
sync_size = merged_df["series_size_MB"].sum()
sync_size_rounded = round(sync_size, 2)

logger.info(f"sync_size_rounded: {sync_size_rounded}")
logger.debug(f"sync_size_rounded: {sync_size_rounded}")

if dirTemplate is not None:
hierarchy = self._generate_sql_concat_for_building_directory(
Expand Down Expand Up @@ -1050,11 +1050,11 @@ def _s5cmd_run(
if sync_size < total_size:
logger.info(
"""
Destination folder is not empty and sync size is less than total size. Displaying a warning
Destination folder is not empty and sync size is less than total size.
"""
)
existing_data_size = round(total_size - sync_size, 2)
logger.warning(
logger.info(
f"Requested total download size is {total_size} MB, \
however at least {existing_data_size} MB is already present,\
so downloading only remaining upto {sync_size} MB\n\
Expand All @@ -1075,7 +1075,7 @@ def _s5cmd_run(
)
else:
logger.info(
"Not using s5cmd sync dry run as the destination folder is empty or sync dry or progress bar is not requested"
"Not using s5cmd sync as the destination folder is empty or sync or progress bar is not requested"
)
cmd = [
self.s5cmdPath,
Expand Down Expand Up @@ -1430,7 +1430,7 @@ def download_from_selection(
list_of_directories = result_df.path.to_list()
else:
list_of_directories = [downloadDir]
logger.info(
logger.debug(
"""
Temporary download manifest is generated and is passed to self._s5cmd_run
"""
Expand Down
11 changes: 11 additions & 0 deletions tests/idcindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import tempfile
import unittest
from itertools import product
from pathlib import Path

import pandas as pd
import pytest
Expand All @@ -27,6 +28,7 @@ def setUp(self):
self.client = index.IDCClient()
self.download_from_manifest = cli.download_from_manifest
self.download_from_selection = cli.download_from_selection
self.download = cli.download

logger = logging.getLogger("idc_index")
logger.setLevel(logging.DEBUG)
Expand Down Expand Up @@ -421,6 +423,15 @@ def test_cli_download_from_manifest(self):
)
assert len(os.listdir(temp_dir)) != 0

def test_cli_download(self):
    """Invoke the `download` command with one UID and check that files appear.

    The command writes into the current working directory, so it is run
    inside click's isolated (temporary) filesystem.
    """
    runner = CliRunner()
    with runner.isolated_filesystem():
        result = runner.invoke(
            self.download,
            ["1.3.6.1.4.1.14519.5.2.1.7695.1700.114861588187429958687900856462"],
        )
        # CliRunner captures exceptions raised by the command; surface them
        # (with the CLI output) instead of failing later with no diagnostics.
        assert result.exit_code == 0, result.output
        # Checked inside the context manager, while the working directory is
        # still the isolated directory the command downloaded into.
        assert len(os.listdir(Path.cwd())) != 0


# Run the unittest test runner when this file is executed directly.
if __name__ == "__main__":
unittest.main()

0 comments on commit fa56355

Please sign in to comment.