From 1b7746d473998558ed99cd115b2ac7d8113d6f8c Mon Sep 17 00:00:00 2001 From: Andrey Fedorov Date: Tue, 21 May 2024 10:47:32 -0400 Subject: [PATCH] enh: add generation of citations from manifest --- idc_index/index.py | 59 +++++++++++++++++++++++++++++++++++++++++++++- tests/idcindex.py | 3 +++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/idc_index/index.py b/idc_index/index.py index 3684fcd1..5f4d76c1 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -1180,6 +1180,54 @@ def download_from_manifest( list_of_directories=list_of_directories, ) + def citations_from_manifest( + self, + manifestFile: str, + citation_format: str = CITATION_FORMAT_APA, + ): + """Get the list of publications that should be cited/attributed for a cohort defined by a manifest. + + Args: + manifestFile (str): string containing the path to the manifest file. + citation_format (str): string containing the format of the citation. Must be one of the following: CITATION_FORMAT_APA, CITATION_FORMAT_BIBTEX, CITATION_FORMAT_JSON. Defaults to CITATION_FORMAT_APA. Can be initialized to the alternative formats as allowed by DOI API, see https://citation.crosscite.org/docs.html#sec-4. + + Returns: + List of citations in the requested format.
+ """ + + manifest_df = pd.read_csv( + manifestFile, + comment="#", + skip_blank_lines=True, + header=None, + names=["manifest_line"], + ) + uuid_pattern = r"s3://.*/([^/]+)/\*" + manifest_df["crdc_series_uuid"] = manifest_df["manifest_line"].str.extract( + uuid_pattern, expand=False + ) + index_copy = self.index[["series_aws_url", "SeriesInstanceUID"]].copy() + index_copy["crdc_series_uuid"] = index_copy["series_aws_url"].str.extract( + uuid_pattern, expand=False + ) + query = """ + SELECT + SeriesInstanceUID + FROM + index_copy + JOIN + manifest_df + ON + index_copy.crdc_series_uuid = manifest_df.crdc_series_uuid + """ + + result_df = self.sql_query(query) + + return self.citations_from_selection( + seriesInstanceUID=result_df["SeriesInstanceUID"].tolist(), + citation_format=citation_format, + ) + def citations_from_selection( self, collection_id=None, @@ -1193,9 +1241,12 @@ def citations_from_selection( Args: collection_id: string or list of strings containing the values of collection_id to filter by patientId: string or list of strings containing the values of PatientID to filter by - studyInstanceUID: string or list of strings containing the values of DICOM StudyInstanceUID to filter by + studyInstanceUID (str): string or list of strings containing the values of DICOM StudyInstanceUID to filter by seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by format: string containing the format of the citation. Must be one of the following: CITATION_FORMAT_APA, CITATION_FORMAT_BIBTEX, CITATION_FORMAT_JSON. Defaults to CITATION_FORMAT_APA. Can be initialized to the alternative formats as allowed by DOI API, see https://citation.crosscite.org/docs.html#sec-4. + + Returns: + List of citations in the requested format. 
""" result_df = self._safe_filter_by_selection( self.index, @@ -1224,13 +1275,19 @@ def citations_from_selection( for doi in distinct_dois: url = "https://dx.doi.org/" + doi + logger.debug(f"Requesting citation for DOI: {doi}") + response = requests.get(url, headers=headers, timeout=timeout) + logger.debug("Received response: " + str(response.status_code)) + if response.status_code == 200: if citation_format == self.CITATION_FORMAT_JSON: citations.append(response.json()) else: citations.append(response.text) + logger.debug("Received citation: " + citations[-1]) + else: logger.error(f"Failed to get citation for DOI: {url}") diff --git a/tests/idcindex.py b/tests/idcindex.py index 7f33f5f6..50757f1d 100644 --- a/tests/idcindex.py +++ b/tests/idcindex.py @@ -370,6 +370,9 @@ def test_citations(self): ) self.assertIsNotNone(citations) + citations = self.client.citations_from_manifest("./study_manifest_aws.s5cmd") + self.assertIsNotNone(citations) + if __name__ == "__main__": unittest.main()