Skip to content

Commit

Permalink
Merge pull request #108 from vkt1414/feat-add-metadata-for-series-not…
Browse files Browse the repository at this point in the history
…-in-latest-index-for-idc

feat: enable downloading data from previous idc data versions
  • Loading branch information
fedorov authored Aug 7, 2024
2 parents 26e6d13 + 01dc5ab commit 399a7b0
Show file tree
Hide file tree
Showing 3 changed files with 117 additions and 28 deletions.
141 changes: 115 additions & 26 deletions idc_index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@ def __init__(self):
file_path = idc_index_data.IDC_INDEX_PARQUET_FILEPATH
logger.debug(f"Reading index file v{idc_index_data.__version__}")
self.index = pd.read_parquet(file_path)

self.previous_versions_index_path = (
idc_index_data.PRIOR_VERSIONS_INDEX_PARQUET_FILEPATH
)

# self.index = self.index.astype(str).replace("nan", "")
self.index["series_size_MB"] = self.index["series_size_MB"].astype(float)
self.collection_summary = self.index.groupby("collection_id").agg(
Expand All @@ -75,6 +80,11 @@ def __init__(self):
"installed": True,
"url": None,
},
"previous_versions_index": {
"description": "index containing one row per DICOM series from all previous IDC versions that are not in current version.",
"installed": True,
"url": None,
},
"sm_index": {
"description": "DICOM Slide Microscopy series-level index.",
"installed": False,
Expand Down Expand Up @@ -617,34 +627,44 @@ def _validate_update_manifest_and_get_download_size(
# create a copy of the index
index_df_copy = self.index

# use default hierarchy
if dirTemplate is not None:
hierarchy = self._generate_sql_concat_for_building_directory(
dirTemplate=dirTemplate, downloadDir=downloadDir
)
else:
hierarchy = "NULL"

# Extract s3 url and crdc_series_uuid from the manifest copy commands
# Next, extract crdc_series_uuid from aws_series_url in the index and
# try to verify if every series in the manifest is present in the index

# TODO: need to remove the assumption that manifest commands will have 'cp'
# and need to parse S3 URL directly
# ruff: noqa
sql = """
sql = f"""
PRAGMA disable_progress_bar;
WITH
index_temp AS (
SELECT
seriesInstanceUID,
series_aws_url,
series_size_MB,
REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){3}([^\\/?#]+)', 1) index_crdc_series_uuid
{hierarchy} AS path,
REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) index_crdc_series_uuid
FROM
index_df_copy),
manifest_temp AS (
SELECT
manifest_cp_cmd,
REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){3}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid,
REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid,
REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '') AS s3_url,
FROM
manifest_df )
SELECT
seriesInstanceuid,
s3_url,
path,
series_size_MB,
index_crdc_series_uuid is not NULL as crdc_series_uuid_match,
s3_url==series_aws_url AS s3_url_match,
Expand Down Expand Up @@ -679,6 +699,75 @@ def _validate_update_manifest_and_get_download_size(
)
logger.error("\n" + "\n".join(missing_manifest_cp_cmds.tolist()))

logger.debug(
"Checking if the requested data is available in other idc versions "
)
missing_series_sql = f"""
PRAGMA disable_progress_bar;
WITH
combined_index AS
(SELECT
*,
{hierarchy} AS path,
FROM
index_df_copy
union by name
SELECT
*,
{hierarchy} AS path,
FROM
'{self.previous_versions_index_path}' pvip
),
index_temp AS (
SELECT
seriesInstanceUID,
series_aws_url,
series_size_MB,
path,
REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) index_crdc_series_uuid
FROM
combined_index),
manifest_temp AS (
SELECT
manifest_cp_cmd,
REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid,
REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '') AS s3_url,
FROM
manifest_df )
SELECT
seriesInstanceuid,
s3_url,
path,
series_size_MB,
index_crdc_series_uuid is not NULL as crdc_series_uuid_match,
TRIM(s3_url) = TRIM(series_aws_url) AS s3_url_match,
manifest_temp.manifest_cp_cmd,
CASE
WHEN TRIM(s3_url) = TRIM(series_aws_url) THEN 'aws'
ELSE
'unknown'
END
AS endpoint
FROM
manifest_temp
LEFT JOIN
index_temp
ON
index_temp.index_crdc_series_uuid = manifest_temp.manifest_crdc_series_uuid
"""
merged_df = duckdb.query(missing_series_sql).df()
if not all(merged_df["crdc_series_uuid_match"]):
missing_manifest_cp_cmds = merged_df.loc[
~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd"
]
logger.error(
"The following manifest copy commands are not recognized as referencing any associated series in any release of IDC.\n"
"This means either these commands are invalid. Please submit an issue on https://github.com/ImagingDataCommons/idc-index/issues \n"
"The corresponding files could not be downloaded.\n"
)
logger.error("\n" + "\n".join(missing_manifest_cp_cmds.tolist()))

if validate_manifest:
# Check if there is more than one endpoint
if len(merged_df["endpoint"].unique()) > 1:
Expand Down Expand Up @@ -728,29 +817,29 @@ def _validate_update_manifest_and_get_download_size(
total_size = merged_df["series_size_MB"].sum()
total_size = round(total_size, 2)

if dirTemplate is not None:
hierarchy = self._generate_sql_concat_for_building_directory(
dirTemplate=dirTemplate, downloadDir=downloadDir
)
sql = f"""
WITH temp as
(
SELECT
seriesInstanceUID,
s3_url
FROM
merged_df
)
SELECT
s3_url,
{hierarchy} as path
FROM
temp
JOIN
index using (seriesInstanceUID)
"""
logger.debug(f"About to run this query:\n{sql}")
merged_df = self.sql_query(sql)
# if dirTemplate is not None:
# hierarchy = self._generate_sql_concat_for_building_directory(
# dirTemplate=dirTemplate, downloadDir=downloadDir
# )
# sql = f"""
# WITH temp as
# (
# SELECT
# seriesInstanceUID,
# s3_url
# FROM
# merged_df
# )
# SELECT
# s3_url,
# {hierarchy} as path
# FROM
# temp
# JOIN
# index using (seriesInstanceUID)
# """
# logger.debug(f"About to run this query:\n{sql}")
# merged_df = self.sql_query(sql)
# Write a temporary manifest file
with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_manifest_file:
if use_s5cmd_sync and len(os.listdir(downloadDir)) != 0:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ dynamic = ["version"]
dependencies = [
"click",
'duckdb>=0.10.0',
"idc-index-data==18.1.0",
"idc-index-data==18.2.0",
"packaging",
"pandas<2.2",
"psutil",
Expand Down
2 changes: 1 addition & 1 deletion tests/idcindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ def test_prior_version_manifest(self):
# is fully resolved, the manifest below should not be empty, and this test should be updated
# with count equal to 5
with open(temp_manifest_file) as file:
assert len(file.readlines()) == 0
assert len(file.readlines()) == 5

def test_list_indices(self):
i = IDCClient()
Expand Down

0 comments on commit 399a7b0

Please sign in to comment.