Skip to content

Commit

Permalink
Merge pull request #105 from fedorov/100-fix-error-reporting
Browse files Browse the repository at this point in the history
ENH: improve error reporting for unrecognized items from the manifest
  • Loading branch information
fedorov authored Jul 31, 2024
2 parents b8bd83a + 7a516d8 commit ab19b89
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 13 deletions.
31 changes: 18 additions & 13 deletions idc_index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,8 +574,8 @@ def _validate_update_manifest_and_get_download_size(
# create a copy of the index
index_df_copy = self.index

# Extract s3 url and crdc_instance_uuid from the manifest copy commands
# Next, extract crdc_instance_uuid from aws_series_url in the index and
# Extract s3 url and crdc_series_uuid from the manifest copy commands
# Next, extract crdc_series_uuid from aws_series_url in the index and
# try to verify if every series in the manifest is present in the index

# TODO: need to remove the assumption that manifest commands will have 'cp'
Expand Down Expand Up @@ -603,8 +603,9 @@ def _validate_update_manifest_and_get_download_size(
seriesInstanceuid,
s3_url,
series_size_MB,
index_crdc_series_uuid==manifest_crdc_series_uuid AS crdc_series_uuid_match,
index_crdc_series_uuid is not NULL as crdc_series_uuid_match,
s3_url==series_aws_url AS s3_url_match,
manifest_temp.manifest_cp_cmd,
CASE
WHEN s3_url==series_aws_url THEN 'aws'
ELSE
Expand All @@ -623,19 +624,23 @@ def _validate_update_manifest_and_get_download_size(

endpoint_to_use = None

if validate_manifest:
# Check if crdc_instance_uuid is found in the index
if not all(merged_df["crdc_series_uuid_match"]):
missing_manifest_cp_cmds = merged_df.loc[
~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd"
]
missing_manifest_cp_cmds_str = f"The following manifest copy commands do not have any associated series in the index: {missing_manifest_cp_cmds.tolist()}"
raise ValueError(missing_manifest_cp_cmds_str)
# Check if any crdc_series_uuid are not found in the index
if not all(merged_df["crdc_series_uuid_match"]):
missing_manifest_cp_cmds = merged_df.loc[
~merged_df["crdc_series_uuid_match"], "manifest_cp_cmd"
]
logger.error(
"The following manifest copy commands are not recognized as referencing any associated series in the index.\n"
"This means either these commands are invalid, or they may correspond to files available in a release of IDC\n"
f"different from {self.get_idc_version()} used in this version of idc-index. The corresponding files will not be downloaded.\n"
)
logger.error("\n" + "\n".join(missing_manifest_cp_cmds.tolist()))

# Check if there are more than one endpoints
if validate_manifest:
# Check if there is more than one endpoint
if len(merged_df["endpoint"].unique()) > 1:
raise ValueError(
"Either GCS bucket path is invalid or manifest has a mix of GCS and AWS urls. If so, please use urls from one provider only"
"Either GCS bucket path is invalid or manifest has a mix of GCS and AWS urls. "
)

if (
Expand Down
22 changes: 22 additions & 0 deletions tests/idcindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,28 @@ def test_cli_download(self):
)
assert len(os.listdir(Path.cwd())) != 0

def test_prior_version_manifest(self):
c = IDCClient()
with tempfile.TemporaryDirectory() as temp_dir:
(
total_size,
endpoint_to_use,
temp_manifest_file,
list_of_directories,
) = c._validate_update_manifest_and_get_download_size(
"./prior_version_manifest.s5cmd",
temp_dir,
True,
False,
IDCClient.DOWNLOAD_HIERARCHY_DEFAULT,
)

# TODO: once issue https://github.com/ImagingDataCommons/idc-index/issues/100
# is fully resolved, the manifest below should not be empty, and this test should be updated
# with count equal to 5
with open(temp_manifest_file) as file:
assert len(file.readlines()) == 0


if __name__ == "__main__":
unittest.main()
5 changes: 5 additions & 0 deletions tests/prior_version_manifest.s5cmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
cp s3://idc-open-data/040fd3e1-0088-4bfd-8439-55e3c5d80a56/* .
cp s3://idc-open-data/04553d0f-1af9-414d-b631-cc31624aced5/* .
cp s3://idc-open-data/068346bf-16ef-4e45-87bf-87feb576a21c/* .
cp s3://idc-open-data/07908d47-5e85-45f3-9649-79c15f606f52/* .
cp s3://idc-open-data/099d180f-1d79-402d-abad-bfd8e2736b04/* .

0 comments on commit ab19b89

Please sign in to comment.