Merge pull request #84 from fedorov/31-citations-list

enh: add functionality to generate the list of citations for the selection
ImagingDataCommons · May 17, 2024 · 76e5951 · 76e5951
2 parents 350079c + 7e9887e
commit 76e5951
Show file tree

Hide file tree

Showing 3 changed files with 136 additions and 33 deletions.
diff --git a/idc_index/index.py b/idc_index/index.py
@@ -15,21 +15,30 @@
 import idc_index_data
 import pandas as pd
 import psutil
+import requests
 from packaging.version import Version
 from tqdm import tqdm
 
 logger = logging.getLogger(__name__)
-logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG)
+logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
 
 aws_endpoint_url = "https://s3.amazonaws.com"
 gcp_endpoint_url = "https://storage.googleapis.com"
 
 
 class IDCClient:
+    # Default download hierarchy template
     DOWNLOAD_HIERARCHY_DEFAULT = (
         "%collection_id/%PatientID/%StudyInstanceUID/%Modality_%SeriesInstanceUID"
     )
 
+    # Predefined citation formats that can be passed to the citation request methods.
+    # See the full list of acceptable values at https://citation.crosscite.org/docs.html#sec-4
+    CITATION_FORMAT_APA = "text/x-bibliography; style=apa; locale=en-US"
+    CITATION_FORMAT_TURTLE = "text/turtle"
+    CITATION_FORMAT_JSON = "application/vnd.citationstyles.csl+json"
+    CITATION_FORMAT_BIBTEX = "application/x-bibtex"
+
     def __init__(self):
         file_path = idc_index_data.IDC_INDEX_PARQUET_FILEPATH
 
@@ -77,6 +86,49 @@ def _filter_dataframe_by_id(key, dataframe, _id):
             raise ValueError(error_message)
         return filtered_df
 
+    @staticmethod
+    def _safe_filter_by_selection(
+        df_index, collection_id, patientId, studyInstanceUID, seriesInstanceUID
+    ):
+        if collection_id is not None:
+            if not isinstance(collection_id, str) and not isinstance(
+                collection_id, list
+            ):
+                raise TypeError("collection_id must be a string or list of strings")
+        if patientId is not None:
+            if not isinstance(patientId, str) and not isinstance(patientId, list):
+                raise TypeError("patientId must be a string or list of strings")
+        if studyInstanceUID is not None:
+            if not isinstance(studyInstanceUID, str) and not isinstance(
+                studyInstanceUID, list
+            ):
+                raise TypeError("studyInstanceUID must be a string or list of strings")
+        if seriesInstanceUID is not None:
+            if not isinstance(seriesInstanceUID, str) and not isinstance(
+                seriesInstanceUID, list
+            ):
+                raise TypeError("seriesInstanceUID must be a string or list of strings")
+
+        if collection_id is not None:
+            result_df = IDCClient._filter_by_collection_id(df_index, collection_id)
+        else:
+            result_df = df_index
+
+        if patientId is not None:
+            result_df = IDCClient._filter_by_patient_id(result_df, patientId)
+
+        if studyInstanceUID is not None:
+            result_df = IDCClient._filter_by_dicom_study_uid(
+                result_df, studyInstanceUID
+            )
+
+        if seriesInstanceUID is not None:
+            result_df = IDCClient._filter_by_dicom_series_uid(
+                result_df, seriesInstanceUID
+            )
+
+        return result_df
+
     @staticmethod
     def _filter_by_collection_id(df_index, collection_id):
         return IDCClient._filter_dataframe_by_id(
@@ -1128,6 +1180,62 @@ def download_from_manifest(
             list_of_directories=list_of_directories,
         )
 
+    def citations_from_selection(
+        self,
+        collection_id=None,
+        patientId=None,
+        studyInstanceUID=None,
+        seriesInstanceUID=None,
+        citation_format=CITATION_FORMAT_APA,
+    ):
+        """Get the list of publications that should be cited/attributed for the specific collection, patient (case) ID, study or series UID.
+
+        Args:
+            collection_id: string or list of strings containing the values of collection_id to filter by
+            patientId: string or list of strings containing the values of PatientID to filter by
+            studyInstanceUID: string or list of strings containing the values of DICOM StudyInstanceUID to filter by
+            seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by
+            citation_format: string containing the format of the citation. Predefined values are CITATION_FORMAT_APA, CITATION_FORMAT_TURTLE, CITATION_FORMAT_JSON and CITATION_FORMAT_BIBTEX. Defaults to CITATION_FORMAT_APA. Any other format accepted by the DOI content negotiation API can also be passed, see https://citation.crosscite.org/docs.html#sec-4.
+        """
+        result_df = self._safe_filter_by_selection(
+            self.index,
+            collection_id=collection_id,
+            patientId=patientId,
+            studyInstanceUID=studyInstanceUID,
+            seriesInstanceUID=seriesInstanceUID,
+        )
+
+        citations = []
+
+        if not result_df.empty:
+            distinct_dois = result_df["source_DOI"].unique().tolist()
+
+            if len(distinct_dois) == 0:
+                logger.error("No DOIs found for the selection.")
+                return citations
+
+            # also include the citation for the current main IDC publication
+            # https://doi.org/10.1148/rg.230180
+            distinct_dois.append("10.1148/rg.230180")
+
+            headers = {"accept": citation_format}
+            timeout = 30
+
+            for doi in distinct_dois:
+                url = "https://dx.doi.org/" + doi
+
+                response = requests.get(url, headers=headers, timeout=timeout)
+
+                if response.status_code == 200:
+                    if citation_format == self.CITATION_FORMAT_JSON:
+                        citations.append(response.json())
+                    else:
+                        citations.append(response.text)
+                else:
+                    logger.error(f"Failed to get citation for DOI: {url}")
+
+        return citations
+
     def download_from_selection(
         self,
         downloadDir,
@@ -1162,38 +1270,13 @@ def download_from_selection(
         if not os.path.exists(downloadDir):
             raise ValueError("Download directory does not exist.")
 
-        if collection_id is not None:
-            if not isinstance(collection_id, str) and not isinstance(
-                collection_id, list
-            ):
-                raise TypeError("collection_id must be a string or list of strings")
-        if patientId is not None:
-            if not isinstance(patientId, str) and not isinstance(patientId, list):
-                raise TypeError("patientId must be a string or list of strings")
-        if studyInstanceUID is not None:
-            if not isinstance(studyInstanceUID, str) and not isinstance(
-                studyInstanceUID, list
-            ):
-                raise TypeError("studyInstanceUID must be a string or list of strings")
-        if seriesInstanceUID is not None:
-            if not isinstance(seriesInstanceUID, str) and not isinstance(
-                seriesInstanceUID, list
-            ):
-                raise TypeError("seriesInstanceUID must be a string or list of strings")
-
-        if collection_id is not None:
-            result_df = self._filter_by_collection_id(self.index, collection_id)
-        else:
-            result_df = self.index
-
-        if patientId is not None:
-            result_df = self._filter_by_patient_id(result_df, patientId)
-
-        if studyInstanceUID is not None:
-            result_df = self._filter_by_dicom_study_uid(result_df, studyInstanceUID)
-
-        if seriesInstanceUID is not None:
-            result_df = self._filter_by_dicom_series_uid(result_df, seriesInstanceUID)
+        result_df = self._safe_filter_by_selection(
+            self.index,
+            collection_id=collection_id,
+            patientId=patientId,
+            studyInstanceUID=studyInstanceUID,
+            seriesInstanceUID=seriesInstanceUID,
+        )
 
         total_size = round(result_df["series_size_MB"].sum(), 2)
         logger.info("Total size of files to download: " + self._format_size(total_size))

diff --git a/pyproject.toml b/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
   "pandas<2.2",
   "psutil",
   "pyarrow",
+  "requests",
   "s5cmd",
   "tqdm"
 ]

diff --git a/tests/idcindex.py b/tests/idcindex.py
@@ -351,6 +351,25 @@ def test_download_from_bogus_manifest(self):
 
                 self.assertEqual(len(os.listdir(temp_dir)), 0)
 
+    def test_citations(self):
+        citations = self.client.citations_from_selection(
+            collection_id="tcga_gbm",
+            citation_format=index.IDCClient.CITATION_FORMAT_APA,
+        )
+        self.assertIsNotNone(citations)
+
+        citations = self.client.citations_from_selection(
+            seriesInstanceUID="1.3.6.1.4.1.14519.5.2.1.7695.4164.588007658875211151397302775781",
+            citation_format=index.IDCClient.CITATION_FORMAT_BIBTEX,
+        )
+        self.assertIsNotNone(citations)
+
+        citations = self.client.citations_from_selection(
+            studyInstanceUID="1.2.840.113654.2.55.174144834924218414213677353968537663991",
+            citation_format=index.IDCClient.CITATION_FORMAT_BIBTEX,
+        )
+        self.assertIsNotNone(citations)
+
 
 if __name__ == "__main__":
     unittest.main()