Skip to content

Commit

Permalink
Merge pull request #101 from DanielaSchacherer/main
Browse files Browse the repository at this point in the history
fetch_index implemented
  • Loading branch information
fedorov authored Aug 2, 2024
2 parents ab19b89 + aa021d5 commit 26e6d13
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 7 deletions.
57 changes: 50 additions & 7 deletions idc_index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

# Endpoint URLs for AWS S3 and Google Cloud Storage.
aws_endpoint_url = "https://s3.amazonaws.com"
gcp_endpoint_url = "https://storage.googleapis.com"
# Base URL of the idc-index-data GitHub release assets, pinned to the version
# of the installed idc_index_data package. The optional index parquet files
# (e.g. sm_index.parquet) are addressed relative to this URL.
asset_endpoint_url = f"https://github.com/ImagingDataCommons/idc-index-data/releases/download/{idc_index_data.__version__}"

# Configure logging at import time: timestamped messages at INFO level,
# plus a module-level logger named after this module.
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -58,9 +59,8 @@ def client(cls) -> IDCClient:
return cls._client

def __init__(self):
# Read main index file
file_path = idc_index_data.IDC_INDEX_PARQUET_FILEPATH

# Read index file
logger.debug(f"Reading index file v{idc_index_data.__version__}")
self.index = pd.read_parquet(file_path)
# self.index = self.index.astype(str).replace("nan", "")
Expand All @@ -69,9 +69,26 @@ def __init__(self):
{"Modality": pd.Series.unique, "series_size_MB": "sum"}
)

self.indices_overview = {
"index": {
"description": "Main index containing one row per DICOM series.",
"installed": True,
"url": None,
},
"sm_index": {
"description": "DICOM Slide Microscopy series-level index.",
"installed": False,
"url": f"{asset_endpoint_url}/sm_index.parquet",
},
"sm_instance_index": {
"description": "DICOM Slide Microscopy instance-level index.",
"installed": False,
"url": f"{asset_endpoint_url}/sm_instance_index.parquet",
},
}

# Lookup s5cmd
self.s5cmdPath = shutil.which("s5cmd")

if self.s5cmdPath is None:
# Workaround to support environment without a properly setup PATH
# See https://github.com/Slicer/Slicer/pull/7587
Expand All @@ -80,16 +97,12 @@ def __init__(self):
if str(script).startswith("s5cmd/bin/s5cmd"):
self.s5cmdPath = script.locate().resolve(strict=True)
break

if self.s5cmdPath is None:
raise FileNotFoundError(
"s5cmd executable not found. Please install s5cmd from https://github.com/peak/s5cmd#installation"
)

self.s5cmdPath = str(self.s5cmdPath)

logger.debug(f"Found s5cmd executable: {self.s5cmdPath}")

# ... and check it can be executed
subprocess.check_call([self.s5cmdPath, "--help"], stdout=subprocess.DEVNULL)

Expand Down Expand Up @@ -177,6 +190,36 @@ def get_idc_version():
idc_version = Version(idc_index_data.__version__).major
return f"v{idc_version}"

def fetch_index(self, index) -> None:
    """
    Download an additional index and attach it to the client.

    The index is looked up in ``self.indices_overview``. Unknown names and
    indices already flagged as installed are only logged; nothing is
    downloaded for them. Otherwise the parquet file is fetched from the
    index's URL, stored next to the main index parquet file, loaded with
    pandas and exposed as an attribute named after the index. On a non-200
    HTTP response an error is logged and no state is changed.

    Args:
        index (str): Name of the index to be downloaded.
    """
    # Guard: the requested name must be one of the known indices.
    if index not in self.indices_overview:
        logger.error(f"Index {index} is not available and can not be fetched.")
        return

    entry = self.indices_overview[index]

    # Guard: never download an index twice.
    if entry["installed"]:
        logger.warning(
            f"Index {index} already installed and will not be fetched again."
        )
        return

    url = entry["url"]
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        logger.error(f"Failed to fetch index from URL {url}: {response.status_code}")
        return

    # Store the download in the directory that holds the main index file.
    target_dir = idc_index_data.IDC_INDEX_PARQUET_FILEPATH.parents[0]
    filepath = os.path.join(target_dir, f"{index}.parquet")
    with open(filepath, mode="wb") as file:
        file.write(response.content)

    # NOTE: the table is set on the class, not on this instance, so it is
    # visible through every instance of the class.
    table = pd.read_parquet(filepath)
    setattr(self.__class__, index, table)
    entry["installed"] = True

def get_collections(self):
"""
Returns the collections present in IDC
Expand Down
11 changes: 11 additions & 0 deletions tests/idcindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,17 @@ def test_prior_version_manifest(self):
with open(temp_manifest_file) as file:
assert len(file.readlines()) == 0

def test_list_indices(self):
    # A freshly constructed client must expose a non-empty overview of the
    # available indices.
    client = IDCClient()
    assert client.indices_overview  # assert that dict was created

def test_fetch_index(self):
    # sm_index starts out as not installed on a new client ...
    client = IDCClient()
    assert client.indices_overview["sm_index"]["installed"] is False

    # ... and after fetching (this performs a real HTTP download) it must be
    # flagged as installed and reachable as an attribute of the client.
    client.fetch_index("sm_index")
    assert client.indices_overview["sm_index"]["installed"] is True
    assert hasattr(client, "sm_index")


if __name__ == "__main__":
unittest.main()

0 comments on commit 26e6d13

Please sign in to comment.