Merge branch 'main' into feat-download-cli

ImagingDataCommons · May 29, 2024 · 60ed607 · 60ed607
2 parents 76ad79b + 13a9bb7
commit 60ed607
Show file tree

Hide file tree

Showing 4 changed files with 160 additions and 37 deletions.
diff --git a/README.md b/README.md
@@ -21,32 +21,43 @@
 
 ## About
 
-`idc-index` is a Python package that enables query of the basic metadata and
-download of DICOM files hosted by the
-[NCI Imaging Data Commons (IDC)](https://imaging.datacommons.cancer.gov).
+`idc-index` is a Python package that enables basic operations for working with
+[NCI Imaging Data Commons (IDC)](https://imaging.datacommons.cancer.gov):
 
-## Usage
+- subsetting of the IDC data using selected metadata attributes
+- download of the files corresponding to selection
+- generation of the viewer URLs for the selected data
 
-There are no prerequisites - just install the package ...
+## Getting started
+
+Install the latest version of the package.
 
 ```bash
-$ pip install idc-index
+$ pip install --upgrade idc-index
 ```
 
-... and download files corresponding to any collection, DICOM
-PatientID/Study/Series as follows:
+Instantiate `IDCClient`, which provides the interface for main operations.
 
 ```python
 from idc_index import index
 
 client = index.IDCClient()
+```
 
-all_collection_ids = client.get_collections()
+You can use [IDC Portal](https://imaging.datacommons.cancer.gov/explore) to
+browse collections, cases, studies and series, copy their identifiers and
+download the corresponding files using `idc-index` helper functions.
 
-client.download_from_selection(collection_id="rider_pilot", downloadDir="/some/dir")
+You can try this out with the `rider_pilot` collection, which is just 10.5 GB in
+size:
+
+```
+client.download_from_selection(collection_id="rider_pilot", downloadDir=".")
 ```
 
-... or run queries against the "mini" index of Imaging Data Commons data!
+... or run queries against the "mini" index of Imaging Data Commons data, and
+download images that match your selection criteria! The following will select
+all Magnetic Resonance (MR) series, and will download the first 10.
 
 ```python
 from idc_index import index
@@ -55,31 +66,25 @@ client = index.IDCClient()
 
 query = """
 SELECT
-  collection_id,
-  STRING_AGG(DISTINCT(Modality)) as modalities,
-  STRING_AGG(DISTINCT(BodyPartExamined)) as body_parts
+  SeriesInstanceUID
 FROM
   index
-GROUP BY
-  collection_id
-ORDER BY
-  collection_id ASC
+WHERE
+  Modality = 'MR'
 """
 
-client.sql_query(query)
-```
+selection_df = client.sql_query(query)
 
-Details of the attributes included in the index are in the release notes.
+client.download_from_selection(
+    seriesInstanceUID=selection_df["SeriesInstanceUID"].values[:10], downloadDir="."
+)
+```
 
 ## Tutorial
 
-This package was first presented at the 2023 Annual meeting of Radiological
-Society of North America (RSNA) Deep Learning Lab
-[IDC session](https://github.com/RSNA/AI-Deep-Learning-Lab-2023/tree/main/sessions/idc).
-
 Please check out
 [this tutorial notebook](https://github.com/ImagingDataCommons/IDC-Tutorials/blob/master/notebooks/labs/idc_rsna2023.ipynb)
-for the introduction into using `idc-index` for navigating IDC data.
+for the introduction into using `idc-index`.
 
 ## Resources
 

diff --git a/docs/index.md b/docs/index.md
@@ -21,6 +21,51 @@ starting a discussion in [IDC User forum](https://discourse.canceridc.dev/).
 :start-after: <!-- SPHINX-START -->
 ```
 
+## The `index` of `idc-index`
+
+`idc-index` is named this way because it wraps index of IDC data: a table
+containing most important metadata attributes describing the files available in
+IDC. This metadata index is available in the `index` variable (which is a pandas
+`DataFrame`) of `IDCClient`.
+
+The following is the list of the columns included in `index`. You can use those
+to select cohorts and subsetting data. `idc-index` is series-based, i.e, it has
+one row per DICOM series.
+
+- non-DICOM attributes assigned/curated by IDC:
+
+  - `collection_id`: short string with the identifier of the collection the
+    series belongs to
+  - `analysis_result_id`: this string is not empty if the specific series is
+    part of an analysis results collection; analysis results can be added to a
+    given collection over time
+  - `source_DOI`: Digital Object Identifier of the dataset that contains the
+    given series; note that a given collection can include one or more DOIs,
+    since analysis results added to the collection would typically have
+    independent DOI values!
+  - `instanceCount`: number of files in the series (typically, this matches the
+    number of slices in cross-sectional modalities)
+  - `license_short_name`: short name of the license that governs the use of the
+    files corresponding to the series
+  - `series_aws_url`: location of the series files in a public AWS bucket
+  - `series_size_MB`: total disk size needed to store the series
+
+- DICOM attributes extracted from the files
+  - `PatientID`: identifier of the patient
+  - `PatientAge` and `PatientSex`: attributes containing patient age and sex
+  - `StudyInstanceUID`: unique identifier of the DICOM study
+  - `StudyDescription`: textual description of the study content
+  - `StudyDate`: date of the study (note that those dates are shifted, and are
+    not real dates when images were acquired, to protect patient privacy)
+  - `SeriesInstanceUID`: unique identifier of the DICOM series
+  - `SeriesDate`: date when the series was acquired
+  - `SeriesDescription`: textual description of the series content
+  - `SeriesNumber`: series number
+  - `BodyPartExamined`: body part imaged
+  - `Modality`: acquisition modality
+  - `Manufacturer`: manufacturer of the equipment that generated the series
+  - `ManufacturerModelName`: model name of the equipment
+
 ## Contents
 
 ```{toctree}

diff --git a/idc_index/index.py b/idc_index/index.py
@@ -783,11 +783,7 @@ def _track_download_progress(
             initial_size_bytes = 0
             # Calculate the initial size of the directory
             for directory in list_of_directories:
-                path = Path(directory)
-                if path.exists() and path.is_dir():
-                    initial_size_bytes += sum(
-                        f.stat().st_size for f in path.iterdir() if f.is_file()
-                    )
+                initial_size_bytes = IDCClient._get_dir_sum_file_size(directory)
 
             logger.info("Initial size of the directory: %s bytes", initial_size_bytes)
             logger.info(
@@ -805,11 +801,7 @@ def _track_download_progress(
             while True:
                 downloaded_bytes = 0
                 for directory in list_of_directories:
-                    path = Path(directory)
-                    if path.exists() and path.is_dir():
-                        downloaded_bytes += sum(
-                            f.stat().st_size for f in path.iterdir() if f.is_file()
-                        )
+                    downloaded_bytes += IDCClient._get_dir_sum_file_size(directory)
                 downloaded_bytes -= initial_size_bytes
                 pbar.n = min(
                     downloaded_bytes, total_size_bytes
@@ -829,6 +821,21 @@ def _track_download_progress(
             while process.poll() is None:
                 time.sleep(0.5)
 
+    @staticmethod
+    def _get_dir_sum_file_size(directory) -> int:
+        path = Path(directory)
+        sum_file_size = 0
+        if path.exists() and path.is_dir():
+            for f in path.iterdir():
+                if f.is_file():
+                    try:
+                        sum_file_size += f.stat().st_size
+                    except FileNotFoundError:
+                        # file must have been removed before we
+                        # could get its size
+                        pass
+        return sum_file_size
+
     def _parse_s5cmd_sync_output_and_generate_synced_manifest(
         self, stdout, downloadDir, dirTemplate
     ) -> Path:
@@ -1180,6 +1187,54 @@ def download_from_manifest(
             list_of_directories=list_of_directories,
         )
 
+    def citations_from_manifest(
+        self,
+        manifestFile: str,
+        citation_format: str = CITATION_FORMAT_APA,
+    ):
+        """Get the list of publications that should be cited/attributed for a cohort defined by a manifest.
+
+        Args:
+            manifestFile (str: string containing the path to the manifest file.
+            format (str): string containing the format of the citation. Must be one of the following: CITATION_FORMAT_APA, CITATION_FORMAT_BIBTEX, CITATION_FORMAT_JSON. Defaults to CITATION_FORMAT_APA. Can be initialized to the alternative formats as allowed by DOI API, see https://citation.crosscite.org/docs.html#sec-4.
+
+        Returns:
+            List of citations in the requested format.
+        """
+
+        manifest_df = pd.read_csv(
+            manifestFile,
+            comment="#",
+            skip_blank_lines=True,
+            header=None,
+            names=["manifest_line"],
+        )
+        uuid_pattern = r"s3://.*/([^/]+)/\*"
+        manifest_df["crdc_series_uuid"] = manifest_df["manifest_line"].str.extract(
+            uuid_pattern, expand=False
+        )
+        index_copy = self.index[["series_aws_url", "SeriesInstanceUID"]].copy()
+        index_copy["crdc_series_uuid"] = index_copy["series_aws_url"].str.extract(
+            uuid_pattern, expand=False
+        )
+        query = """
+        SELECT
+          SeriesInstanceUID
+        FROM
+          index_copy
+        JOIN
+          manifest_df
+        ON
+          index_copy.crdc_series_uuid = manifest_df.crdc_series_uuid
+        """
+
+        result_df = self.sql_query(query)
+
+        return self.citations_from_selection(
+            seriesInstanceUID=result_df["SeriesInstanceUID"].tolist(),
+            citation_format=citation_format,
+        )
+
     def citations_from_selection(
         self,
         collection_id=None,
@@ -1193,9 +1248,12 @@ def citations_from_selection(
         Args:
             collection_id: string or list of strings containing the values of collection_id to filter by
             patientId: string or list of strings containing the values of PatientID to filter by
-            studyInstanceUID: string or list of strings containing the values of DICOM StudyInstanceUID to filter by
+            studyInstanceUID (str): string or list of strings containing the values of DICOM StudyInstanceUID to filter by
             seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by
             format: string containing the format of the citation. Must be one of the following: CITATION_FORMAT_APA, CITATION_FORMAT_BIBTEX, CITATION_FORMAT_JSON. Defaults to CITATION_FORMAT_APA. Can be initialized to the alternative formats as allowed by DOI API, see https://citation.crosscite.org/docs.html#sec-4.
+
+        Returns:
+            List of citations in the requested format.
         """
         result_df = self._safe_filter_by_selection(
             self.index,
@@ -1224,15 +1282,24 @@ def citations_from_selection(
             for doi in distinct_dois:
                 url = "https://dx.doi.org/" + doi
 
+                logger.debug(f"Requesting citation for DOI: {doi}")
+
                 response = requests.get(url, headers=headers, timeout=timeout)
 
+                logger.debug("Received response: " + str(response.status_code))
+
                 if response.status_code == 200:
                     if citation_format == self.CITATION_FORMAT_JSON:
                         citations.append(response.json())
                     else:
                         citations.append(response.text)
+                    logger.debug("Received citation: " + citations[-1])
+
                 else:
                     logger.error(f"Failed to get citation for DOI: {url}")
+                    logger.error(
+                        f"DOI server response status code: {response.status_code}"
+                    )
 
         return citations
 

diff --git a/tests/idcindex.py b/tests/idcindex.py
@@ -354,6 +354,8 @@ def test_download_from_bogus_manifest(self):
 
                 self.assertEqual(len(os.listdir(temp_dir)), 0)
 
+    """
+    disabling these tests due to a consistent server timeout issue
     def test_citations(self):
         citations = self.client.citations_from_selection(
             collection_id="tcga_gbm",
@@ -373,6 +375,10 @@ def test_citations(self):
         )
         self.assertIsNotNone(citations)
 
+        citations = self.client.citations_from_manifest("./study_manifest_aws.s5cmd")
+        self.assertIsNotNone(citations)
+    """
+
     def test_cli_download_from_selection(self):
         runner = CliRunner()
         with tempfile.TemporaryDirectory() as temp_dir: