Skip to content

Commit

Permalink
Merge pull request #86 from fedorov/citations-from-manifest
Browse files Browse the repository at this point in the history
enh: add generation of citations from manifest
  • Loading branch information
fedorov authored May 21, 2024
2 parents 76e5951 + 1b7746d commit 0603347
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 1 deletion.
59 changes: 58 additions & 1 deletion idc_index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1180,6 +1180,54 @@ def download_from_manifest(
list_of_directories=list_of_directories,
)

def citations_from_manifest(
    self,
    manifestFile: str,
    citation_format: str = CITATION_FORMAT_APA,
):
    """Return the citations to attribute for the cohort listed in a manifest.

    Every manifest line, and every ``series_aws_url`` in the index, is reduced
    to the path component that precedes the trailing ``/*`` of an ``s3://`` URL
    (kept in a ``crdc_series_uuid`` column). Index rows whose value equals one
    found in the manifest supply the SeriesInstanceUIDs that are handed to
    ``citations_from_selection``.

    Args:
        manifestFile (str): path to the manifest file. Text following ``#`` is treated as a comment and blank lines are skipped.
        citation_format (str): format of the citation. Must be one of the following: CITATION_FORMAT_APA, CITATION_FORMAT_BIBTEX, CITATION_FORMAT_JSON. Defaults to CITATION_FORMAT_APA. Can be initialized to the alternative formats as allowed by DOI API, see https://citation.crosscite.org/docs.html#sec-4.

    Returns:
        The result of ``citations_from_selection`` for the matched series, i.e. the list of citations in the requested format.
    """
    # Captures the last path component before the "/*" wildcard.
    series_dir_regex = r"s3://.*/([^/]+)/\*"

    # One manifest line per row, held in a single "manifest_line" column.
    manifest_lines = pd.read_csv(
        manifestFile,
        header=None,
        names=["manifest_line"],
        comment="#",
        skip_blank_lines=True,
    )

    # NOTE(review): `manifest_df` and `index_copy` are used verbatim as table
    # names in the SQL text below; presumably self.sql_query resolves them
    # from these local variables -- confirm before renaming either one.
    manifest_df = manifest_lines.assign(
        crdc_series_uuid=manifest_lines["manifest_line"].str.extract(
            series_dir_regex, expand=False
        )
    )

    # assign() returns a new frame, so self.index itself is left untouched.
    indexed_series = self.index[["series_aws_url", "SeriesInstanceUID"]]
    index_copy = indexed_series.assign(
        crdc_series_uuid=indexed_series["series_aws_url"].str.extract(
            series_dir_regex, expand=False
        )
    )

    join_sql = """
SELECT
SeriesInstanceUID
FROM
index_copy
JOIN
manifest_df
ON
index_copy.crdc_series_uuid = manifest_df.crdc_series_uuid
"""

    matched_series = self.sql_query(join_sql)
    series_uids = matched_series["SeriesInstanceUID"].tolist()

    return self.citations_from_selection(
        seriesInstanceUID=series_uids,
        citation_format=citation_format,
    )

def citations_from_selection(
self,
collection_id=None,
Expand All @@ -1193,9 +1241,12 @@ def citations_from_selection(
Args:
collection_id: string or list of strings containing the values of collection_id to filter by
patientId: string or list of strings containing the values of PatientID to filter by
studyInstanceUID: string or list of strings containing the values of DICOM StudyInstanceUID to filter by
studyInstanceUID (str): string or list of strings containing the values of DICOM StudyInstanceUID to filter by
seriesInstanceUID: string or list of strings containing the values of DICOM SeriesInstanceUID to filter by
format: string containing the format of the citation. Must be one of the following: CITATION_FORMAT_APA, CITATION_FORMAT_BIBTEX, CITATION_FORMAT_JSON. Defaults to CITATION_FORMAT_APA. Can be initialized to the alternative formats as allowed by DOI API, see https://citation.crosscite.org/docs.html#sec-4.
Returns:
List of citations in the requested format.
"""
result_df = self._safe_filter_by_selection(
self.index,
Expand Down Expand Up @@ -1224,13 +1275,19 @@ def citations_from_selection(
for doi in distinct_dois:
url = "https://dx.doi.org/" + doi

logger.debug(f"Requesting citation for DOI: {doi}")

response = requests.get(url, headers=headers, timeout=timeout)

logger.debug("Received response: " + str(response.status_code))

if response.status_code == 200:
if citation_format == self.CITATION_FORMAT_JSON:
citations.append(response.json())
else:
citations.append(response.text)
logger.debug("Received citation: " + citations[-1])

else:
logger.error(f"Failed to get citation for DOI: {url}")

Expand Down
3 changes: 3 additions & 0 deletions tests/idcindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,9 @@ def test_citations(self):
)
self.assertIsNotNone(citations)

citations = self.client.citations_from_manifest("./study_manifest_aws.s5cmd")
self.assertIsNotNone(citations)


# Run the unittest test runner when this file is executed directly as a script.
if __name__ == "__main__":
unittest.main()

0 comments on commit 0603347

Please sign in to comment.