From 9e1817c6f2ba7cd46224018c3926cac8948ac37a Mon Sep 17 00:00:00 2001 From: Andrey Fedorov Date: Wed, 31 Jul 2024 13:49:53 -0400 Subject: [PATCH 1/6] ENH added functionality and tests to list available indices and fetch sm-specific ones. --- idc_index/index.py | 108 ++++++++++++++++++++++++++++++++++++++++++--- tests/idcindex.py | 10 +++++ 2 files changed, 111 insertions(+), 7 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index d51d4dc0..abe4787c 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -58,9 +58,8 @@ def client(cls) -> IDCClient: return cls._client def __init__(self): + # Read main index file file_path = idc_index_data.IDC_INDEX_PARQUET_FILEPATH - - # Read index file logger.debug(f"Reading index file v{idc_index_data.__version__}") self.index = pd.read_parquet(file_path) # self.index = self.index.astype(str).replace("nan", "") @@ -68,10 +67,10 @@ def __init__(self): self.collection_summary = self.index.groupby("collection_id").agg( {"Modality": pd.Series.unique, "series_size_MB": "sum"} ) + self.indices_overview = self.list_indices() # Lookup s5cmd self.s5cmdPath = shutil.which("s5cmd") - if self.s5cmdPath is None: # Workaround to support environment without a properly setup PATH # See https://github.com/Slicer/Slicer/pull/7587 @@ -80,16 +79,12 @@ def __init__(self): if str(script).startswith("s5cmd/bin/s5cmd"): self.s5cmdPath = script.locate().resolve(strict=True) break - if self.s5cmdPath is None: raise FileNotFoundError( "s5cmd executable not found. Please install s5cmd from https://github.com/peak/s5cmd#installation" ) - self.s5cmdPath = str(self.s5cmdPath) - logger.debug(f"Found s5cmd executable: {self.s5cmdPath}") - # ... and check it can be executed subprocess.check_call([self.s5cmdPath, "--help"], stdout=subprocess.DEVNULL) @@ -177,6 +172,105 @@ def get_idc_version(): idc_version = Version(idc_index_data.__version__).major return f"v{idc_version}" + @staticmethod + def _get_latest_idc_index_data_release_assets(): + """ + Retrieves a list of the latest idc-index-data release assets. + + Returns: + release_assets (list): List of tuples (asset_name, asset_url). + """ + release_assets = [] + url = f"https://api.github.com/repos/ImagingDataCommons/idc-index-data/releases/tags/{idc_index_data.__version__}" + try: + response = requests.get(url, timeout=30) + if response.status_code == 200: + release_data = response.json() + assets = release_data.get("assets", []) + for asset in assets: + release_assets.append( + (asset["name"], asset["browser_download_url"]) + ) + else: + logger.error(f"Failed to fetch releases: {response.status_code}") + + except FileNotFoundError: + logger.error(f"Failed to fetch releases: {response.status_code}") + + return release_assets + + def list_indices(self): + """ + Lists all available indices including their installation status. + + Returns: + indices_overview (pd.DataFrame): DataFrame containing information per index. + """ + + if "indices_overview" not in locals(): + indices_overview = {} + # Find installed indices + for file in distribution("idc-index-data").files: + if str(file).endswith("index.parquet"): + index_name = os.path.splitext( + str(file).rsplit("/", maxsplit=1)[-1] + )[0] + + indices_overview[index_name] = { + "description": None, + "installed": True, + "local_path": os.path.join( + idc_index_data.IDC_INDEX_PARQUET_FILEPATH.parents[0], + f"{index_name}.parquet", + ), + } + + # Find available indices from idc-index-data + release_assets = self._get_latest_idc_index_data_release_assets() + for asset_name, asset_url in release_assets: + if asset_name.endswith(".parquet"): + asset_name = os.path.splitext(asset_name)[0] + if asset_name not in indices_overview: + indices_overview[asset_name] = { + "description": None, + "installed": False, + "url": asset_url, + } + + self.indices_overview = pd.DataFrame.from_dict( + indices_overview, orient="index" + ) + + return self.indices_overview + + def fetch_index(self, index) -> None: + """ + Downloads requested index. + + Args: + index (str): Name of the index to be downloaded. + """ + + if index not in self.indices_overview.index.tolist(): + logger.error(f"Index {index} is not available and can not be fetched.") + elif self.indices_overview.loc[index, "installed"]: + logger.warning( + f"Index {index} already installed and will not be fetched again." + ) + else: + response = requests.get(self.indices_overview.loc[index, "url"], timeout=30) + if response.status_code == 200: + filepath = os.path.join( + idc_index_data.IDC_INDEX_PARQUET_FILEPATH.parents[0], + f"{index}.parquet", + ) + with open(filepath, mode="wb") as file: + file.write(response.content) + self.indices_overview.loc[index, "installed"] = True + self.indices_overview.loc[index, "local_path"] = filepath + else: + logger.error(f"Failed to fetch index: {response.status_code}") + def get_collections(self): """ Returns the collections present in IDC diff --git a/tests/idcindex.py b/tests/idcindex.py index 7e6c4a4b..a0c5bd34 100644 --- a/tests/idcindex.py +++ b/tests/idcindex.py @@ -483,6 +483,16 @@ def test_prior_version_manifest(self): with open(temp_manifest_file) as file: assert len(file.readlines()) == 0 + def test_list_indices(self): + i = IDCClient() + assert not i.indices_overview.empty # assert that df was created + + def test_fetch_index(self): + i = IDCClient() + assert i.indices_overview["sm_index", "installed"] is False + i.fetch_index("sm_index") + assert i.indices_overview["sm_index", "installed"] is True + if __name__ == "__main__": unittest.main() From 3e5b526b5fb10234907af39122efaa07efe324eb Mon Sep 17 00:00:00 2001 From: Andrey Fedorov Date: Fri, 26 Jul 2024 10:21:51 -0400 Subject: [PATCH 2/6] ENH: Replaced dynamic Github release asset access with manually maintained table. Co-authored-by: Daniela Schacherer --- idc_index/index.py | 90 ++++++++++++---------------------------------- 1 file changed, 23 insertions(+), 67 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index abe4787c..4089342c 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -21,6 +21,7 @@ aws_endpoint_url = "https://s3.amazonaws.com" gcp_endpoint_url = "https://storage.googleapis.com" +asset_endpoint_url = f"https://api.github.com/repos/ImagingDataCommons/idc-index-data/releases/tags/{idc_index_data.__version__}" logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO) logger = logging.getLogger(__name__) @@ -67,7 +68,24 @@ def __init__(self): self.collection_summary = self.index.groupby("collection_id").agg( {"Modality": pd.Series.unique, "series_size_MB": "sum"} ) - self.indices_overview = self.list_indices() + + self.indices_overview = pd.DataFrame( + { + "index": {"description": None, "installed": True, "url": None}, + "sm_index": { + "description": None, + "installed": True, + "url": os.path.join(asset_endpoint_url, "sm_index.parquet"), + }, + "sm_instance_index": { + "description": None, + "installed": True, + "url": os.path.join( + asset_endpoint_url, "sm_instance_index.parquet" + ), + }, + } + ) # Lookup s5cmd self.s5cmdPath = shutil.which("s5cmd") @@ -172,33 +190,6 @@ def get_idc_version(): idc_version = Version(idc_index_data.__version__).major return f"v{idc_version}" - @staticmethod - def _get_latest_idc_index_data_release_assets(): - """ - Retrieves a list of the latest idc-index-data release assets. - - Returns: - release_assets (list): List of tuples (asset_name, asset_url). - """ - release_assets = [] - url = f"https://api.github.com/repos/ImagingDataCommons/idc-index-data/releases/tags/{idc_index_data.__version__}" - try: - response = requests.get(url, timeout=30) - if response.status_code == 200: - release_data = response.json() - assets = release_data.get("assets", []) - for asset in assets: - release_assets.append( - (asset["name"], asset["browser_download_url"]) - ) - else: - logger.error(f"Failed to fetch releases: {response.status_code}") - - except FileNotFoundError: - logger.error(f"Failed to fetch releases: {response.status_code}") - - return release_assets - def list_indices(self): """ Lists all available indices including their installation status. @@ -207,40 +198,6 @@ def list_indices(self): indices_overview (pd.DataFrame): DataFrame containing information per index. """ - if "indices_overview" not in locals(): - indices_overview = {} - # Find installed indices - for file in distribution("idc-index-data").files: - if str(file).endswith("index.parquet"): - index_name = os.path.splitext( - str(file).rsplit("/", maxsplit=1)[-1] - )[0] - - indices_overview[index_name] = { - "description": None, - "installed": True, - "local_path": os.path.join( - idc_index_data.IDC_INDEX_PARQUET_FILEPATH.parents[0], - f"{index_name}.parquet", - ), - } - - # Find available indices from idc-index-data - release_assets = self._get_latest_idc_index_data_release_assets() - for asset_name, asset_url in release_assets: - if asset_name.endswith(".parquet"): - asset_name = os.path.splitext(asset_name)[0] - if asset_name not in indices_overview: - indices_overview[asset_name] = { - "description": None, - "installed": False, - "url": asset_url, - } - - self.indices_overview = pd.DataFrame.from_dict( - indices_overview, orient="index" - ) - return self.indices_overview def fetch_index(self, index) -> None: @@ -251,14 +208,14 @@ def fetch_index(self, index) -> None: index (str): Name of the index to be downloaded. """ - if index not in self.indices_overview.index.tolist(): + if index not in self.indices_overview.keys(): logger.error(f"Index {index} is not available and can not be fetched.") - elif self.indices_overview.loc[index, "installed"]: + elif self.indices_overview[index]["installed"]: logger.warning( f"Index {index} already installed and will not be fetched again." ) else: - response = requests.get(self.indices_overview.loc[index, "url"], timeout=30) + response = requests.get(self.indices_overview[index]["url"], timeout=30) if response.status_code == 200: filepath = os.path.join( idc_index_data.IDC_INDEX_PARQUET_FILEPATH.parents[0], @@ -266,8 +223,7 @@ def fetch_index(self, index) -> None: ) with open(filepath, mode="wb") as file: file.write(response.content) - self.indices_overview.loc[index, "installed"] = True - self.indices_overview.loc[index, "local_path"] = filepath + self.indices_overview[index]["installed"] = True else: logger.error(f"Failed to fetch index: {response.status_code}") From 85f40917c10a3e85c92ad819bca0ff01c100b4ad Mon Sep 17 00:00:00 2001 From: ds_93 Date: Thu, 1 Aug 2024 16:42:52 +0200 Subject: [PATCH 3/6] BUG: fixing CI issues * fixed URL for download and handling of indices overview * fixed pylint bug about iterating dictionaries * fixed tests for list_indices and fetch_index --- idc_index/index.py | 40 +++++++++++++++++++--------------------- tests/idcindex.py | 6 +++--- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index 4089342c..089b4a69 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -21,7 +21,7 @@ aws_endpoint_url = "https://s3.amazonaws.com" gcp_endpoint_url = "https://storage.googleapis.com" -asset_endpoint_url = f"https://api.github.com/repos/ImagingDataCommons/idc-index-data/releases/tags/{idc_index_data.__version__}" +asset_endpoint_url = f"https://github.com/ImagingDataCommons/idc-index-data/releases/download/{idc_index_data.__version__}" logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO) logger = logging.getLogger(__name__) @@ -69,23 +69,19 @@ def __init__(self): {"Modality": pd.Series.unique, "series_size_MB": "sum"} ) - self.indices_overview = pd.DataFrame( - { - "index": {"description": None, "installed": True, "url": None}, - "sm_index": { - "description": None, - "installed": True, - "url": os.path.join(asset_endpoint_url, "sm_index.parquet"), - }, - "sm_instance_index": { - "description": None, - "installed": True, - "url": os.path.join( - asset_endpoint_url, "sm_instance_index.parquet" - ), - }, - } - ) + self.indices_overview = { + "index": {"description": None, "installed": True, "url": None}, + "sm_index": { + "description": None, + "installed": False, + "url": f"{asset_endpoint_url}/sm_index.parquet", + }, + "sm_instance_index": { + "description": None, + "installed": False, + "url": f"{asset_endpoint_url}/sm_instance_index.parquet", + }, + } # Lookup s5cmd self.s5cmdPath = shutil.which("s5cmd") @@ -198,7 +194,7 @@ def list_indices(self): indices_overview (pd.DataFrame): DataFrame containing information per index. """ - return self.indices_overview + return pd.DataFrame.from_dict(self.indices_overview, orient="index") def fetch_index(self, index) -> None: """ @@ -208,7 +204,7 @@ def fetch_index(self, index) -> None: index (str): Name of the index to be downloaded. """ - if index not in self.indices_overview.keys(): + if index not in self.indices_overview: logger.error(f"Index {index} is not available and can not be fetched.") elif self.indices_overview[index]["installed"]: logger.warning( @@ -225,7 +221,9 @@ def fetch_index(self, index) -> None: file.write(response.content) self.indices_overview[index]["installed"] = True else: - logger.error(f"Failed to fetch index: {response.status_code}") + logger.error( + f"Failed to fetch index from URL {self.indices_overview[index]['url']}: {response.status_code}" + ) def get_collections(self): """ diff --git a/tests/idcindex.py b/tests/idcindex.py index a0c5bd34..e82e015f 100644 --- a/tests/idcindex.py +++ b/tests/idcindex.py @@ -485,13 +485,13 @@ def test_prior_version_manifest(self): def test_list_indices(self): i = IDCClient() - assert not i.indices_overview.empty # assert that df was created + assert i.indices_overview # assert that dict was created def test_fetch_index(self): i = IDCClient() - assert i.indices_overview["sm_index", "installed"] is False + assert i.indices_overview["sm_index"]["installed"] is False i.fetch_index("sm_index") - assert i.indices_overview["sm_index", "installed"] is True + assert i.indices_overview["sm_index"]["installed"] is True if __name__ == "__main__": From 0c36cb118d765180bb0d7f8ce5053a605e60081e Mon Sep 17 00:00:00 2001 From: Andrey Fedorov Date: Thu, 1 Aug 2024 12:56:55 -0400 Subject: [PATCH 4/6] DOC: add brief descriptions for indices --- idc_index/index.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index 089b4a69..8eb26568 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -70,14 +70,18 @@ def __init__(self): ) self.indices_overview = { - "index": {"description": None, "installed": True, "url": None}, + "index": { + "description": "Main index containing one row per DICOM series.", + "installed": True, + "url": None, + }, "sm_index": { - "description": None, + "description": "DICOM Slide Microscopy series-level index.", "installed": False, "url": f"{asset_endpoint_url}/sm_index.parquet", }, "sm_instance_index": { - "description": None, + "description": "DICOM Slide Microscopy instance-level index.", "installed": False, "url": f"{asset_endpoint_url}/sm_instance_index.parquet", }, From 98375be374a5a5ad36ad333dfa23bc200394df5d Mon Sep 17 00:00:00 2001 From: ds_93 Date: Fri, 2 Aug 2024 13:56:31 +0200 Subject: [PATCH 5/6] ENH: now setting class variable within fetch_index function and included this in the respective test --- idc_index/index.py | 1 + tests/idcindex.py | 1 + 2 files changed, 2 insertions(+) diff --git a/idc_index/index.py b/idc_index/index.py index 8eb26568..5cc76be8 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -223,6 +223,7 @@ def fetch_index(self, index) -> None: ) with open(filepath, mode="wb") as file: file.write(response.content) + setattr(self.__class__, index, pd.read_parquet(filepath)) self.indices_overview[index]["installed"] = True else: logger.error( diff --git a/tests/idcindex.py b/tests/idcindex.py index e82e015f..168ac1f0 100644 --- a/tests/idcindex.py +++ b/tests/idcindex.py @@ -492,6 +492,7 @@ def test_fetch_index(self): assert i.indices_overview["sm_index"]["installed"] is False i.fetch_index("sm_index") assert i.indices_overview["sm_index"]["installed"] is True + assert hasattr(i, "sm_index") if __name__ == "__main__": From aa021d5b05579f2665dec2212c26aed4b92fc56a Mon Sep 17 00:00:00 2001 From: Andrey Fedorov Date: Fri, 2 Aug 2024 17:50:18 -0400 Subject: [PATCH 6/6] STYLE: remove DF conversion function for the index overview In the future, we can add convenience conversion util function that could serve same purpose without being attached to a specific class variable --- idc_index/index.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/idc_index/index.py b/idc_index/index.py index 5cc76be8..1ec55233 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -190,16 +190,6 @@ def get_idc_version(): idc_version = Version(idc_index_data.__version__).major return f"v{idc_version}" - def list_indices(self): - """ - Lists all available indices including their installation status. - - Returns: - indices_overview (pd.DataFrame): DataFrame containing information per index. - """ - - return pd.DataFrame.from_dict(self.indices_overview, orient="index") - def fetch_index(self, index) -> None: """ Downloads requested index.