Skip to content

Commit

Permalink
downloader
Browse files Browse the repository at this point in the history
  • Loading branch information
pnrobinson committed Jan 14, 2024
1 parent c2273d4 commit c40b5ee
Show file tree
Hide file tree
Showing 9 changed files with 1,476 additions and 53 deletions.
40 changes: 20 additions & 20 deletions docs/explanations/cda_disease.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,18 @@ We extract information about the disease diagnosis from two CDA tables, `diagnos
## diagnosis


| Column | Example | Explanation |
|:----------------|:---------------|:----------------|
| diagnosis_id | CGCI-HTMCP-CC.HTMCP-03-06-02424.HTMCP-03-06-02424_diagnosis| y |
| diagnosis_identifier | see below | y |
| primary_diagnosis | Squamous cell carcinoma, keratinizing, NOS | y |
| age_at_diagnosis | 13085.0 | y |
| morphology | 8071/3 | y |
| stage | None | y |
| grade | G3 | y |
| method_of_diagnosis | Biopsy | y |
| subject_id | CGCI.HTMCP-03-06-02424 | y |
| researchsubject_id | CGCI-HTMCP-CC.HTMCP-03-06-02424| y |
| Column | Example | Explanation |
|:---------------------|:------------------------------------------------------------|:------------|
| diagnosis_id | CGCI-HTMCP-CC.HTMCP-03-06-02424.HTMCP-03-06-02424_diagnosis | y |
| diagnosis_identifier | see below | y |
| primary_diagnosis | Squamous cell carcinoma, keratinizing, NOS | y |
| age_at_diagnosis | 13085.0 | y |
| morphology | 8071/3 | y |
| stage | None | y |
| grade | G3 | y |
| method_of_diagnosis | Biopsy | y |
| subject_id | CGCI.HTMCP-03-06-02424 | y |
| researchsubject_id | CGCI-HTMCP-CC.HTMCP-03-06-02424 | y |


The fields of the table have the following meaning.
Expand Down Expand Up @@ -57,14 +57,14 @@ Identifier for the researchsubject (which can be a sample or an individual - Que
## researchsubject


| Column | Example | Explanation |
|:----------------|:---------------|:----------------|
| researchsubject_id | CPTAC-3.C3L-00563 | y |
| researchsubject_identifier | see below | y |
| member_of_research_project | CPTAC-3 | y |
| primary_diagnosis_condition | Adenomas and Adenocarcinomas | y |
| primary_diagnosis_site | Uterus, NOS | y |
| subject_id | CPTAC.C3L-00563 | y |
| Column | Example | Explanation |
|:-------------------------------|:-----------------------------|:------------|
| researchsubject_id | CPTAC-3.C3L-00563 | y |
| researchsubject_identifier | see below | y |
| member_of_research_project | CPTAC-3 | y |
| primary_diagnosis_condition | Adenomas and Adenocarcinomas | y |
| primary_diagnosis_site | Uterus, NOS | y |
| subject_id | CPTAC.C3L-00563 | y |


- researchsubject_id
Expand Down
2 changes: 2 additions & 0 deletions src/oncoexporter/cda/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .cda_disease_factory import CdaDiseaseFactory
from .cda_factory import CdaFactory
from .cda_individual_factory import CdaIndividualFactory
from .cda_biosample_factory import CdaBiosampleFactory
from .cda_mutation_factory import CdaMutationFactory
Expand All @@ -9,6 +10,7 @@
__version__ = "0.0.2"

__all__ = [
"CdaFactory",
"CdaDiseaseFactory",
"CdaIndividualFactory",
"CdaBiosampleFactory",
Expand Down
28 changes: 10 additions & 18 deletions src/oncoexporter/cda/cda_disease_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import phenopackets as PPkt
import pandas as pd
import os
import pkg_resources

import requests
import csv
import warnings
Expand Down Expand Up @@ -58,37 +58,26 @@ def __init__(self, op_mapper:OpMapper=None,
self._opMapper = OpDiagnosisMapper()
else:
self._opMapper = op_mapper
self._icdo_to_ncit = self.load_icdo_to_ncit_tsv()
# self._icdo_to_ncit = self.load_icdo_to_ncit_tsv()
#self._download_and_icdo_to_ncit_tsv(icdo_to_ncit_map_url, key_column=key_column)


def load_icdo_to_ncit_tsv(self, overwrite:bool=False, local_dir:str=None):
"""
Download if necessary the NCIT ICD-O mapping file and store it in the package downloaded_files folder
Download if necessary the NCIT ICD-O mapping file and store it in the package ncit_files folder
:param overwrite: whether to overwrite an existing file (otherwise we skip downloading)
:type overwrite: bool
:param local_dir: Path to a directory to write downloaded file
"""
icdo_to_ncit_map_url = 'https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/Mappings/ICD-O-3_Mappings/ICD-O-3.1-NCIt_Morphology_Mapping.txt',
key_column = 'ICD-O Code'
local_dir = self.get_local_share_directory()
icd_path = os.path.join(local_dir, 'ICD-O-3.1-NCIt_Morphology_Mapping.txt')
if not os.path.isfile(icd_path):
print(f"Downloading {icdo_to_ncit_map_url}")
response = requests.get(icdo_to_ncit_map_url)
response.raise_for_status() # This will raise an error if the download failed
tsv_data = csv.DictReader(response.text.splitlines(), delimiter='\t')
with open(icd_path, 'w', newline='\n') as f:
writer = csv.writer(f)
writer.writerows(tsv_data)
print(f"Downloaded {icdo_to_ncit_map_url}")
# When we get here, either we have just downloaded the ICD-O file or it was already available locally.
stream = pkg_resources.resource_stream(icd_path)
df = pd.read_csv(stream, encoding='latin-1')
# stream = pkg_resources.resource_stream("", icd_path)
icd_path = self._icdo_to_ncit_path
df = pd.read_csv(icd_path, encoding='latin-1')
result_dict = {}
if key_column not in df.columns:
raise ValueError(f"Couldn't find key_column {key_column} in fieldnames "
f"{df.columns} of file downloaded from {icdo_to_ncit_map_url}")
f"{df.columns} of file at {icd_path}")
for idx, row in df.iterrows():
key = row[key_column]
if key:
Expand Down Expand Up @@ -117,6 +106,9 @@ def _download_and_icdo_to_ncit_tsv(self, url: str, key_column: str) -> dict:

return result_dict

def temp_get_load_icdo_to_ncit_tsv(self):
    """Temporary accessor that hands back the object stored in ``self._icdo_to_ncit``.

    :returns: whatever is currently bound to the ``_icdo_to_ncit`` attribute
    """
    # NOTE(review): the attribute must already have been assigned elsewhere;
    # this accessor does not create or load it.
    icdo_to_ncit = self._icdo_to_ncit
    return icdo_to_ncit

def to_ga4gh(self, row):
"""Convert a row from the CDA subject table into an Individual message (GA4GH Phenopacket Schema)
Expand Down
60 changes: 60 additions & 0 deletions src/oncoexporter/cda/cda_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import os
import platform
import requests
import csv
import warnings
from importlib_resources import files


class CdaDownloader:
    """Locate, and download when necessary, the external files used by the CDA factories.

    Downloaded files are cached in a local directory (``~/.oncoexporter`` unless
    another directory is passed in), so the network is only hit when the file is
    missing or an overwrite is explicitly requested.
    """

    _ICDO_TO_NCIT_URL = 'https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/Mappings/ICD-O-3_Mappings/ICD-O-3.1-NCIt_Morphology_Mapping.txt'
    _ICDO_TO_NCIT_FILENAME = 'ICD-O-3.1-NCIt_Morphology_Mapping.txt'
    # Seconds to wait for the server before giving up instead of hanging forever.
    _DOWNLOAD_TIMEOUT = 60

    def __init__(self):
        # Defined up front so get_icdo_to_ncit_path() returns None, rather than
        # raising AttributeError, when called before download_if_needed().
        self._icdo_to_ncit_path = None
        self.get_ncit_neoplasm_core()

    def download_if_needed(self, overwrite_downloads: bool = False):
        """Make sure every required file is available locally.

        :param overwrite_downloads: if True, download again even when a cached copy exists
        :type overwrite_downloads: bool
        """
        local_dir = self.get_local_share_directory()
        self._icdo_to_ncit_path = None
        self.load_icdo_to_ncit_tsv(overwrite=overwrite_downloads, local_dir=local_dir)

    def get_icdo_to_ncit_path(self):
        """Return the path of the cached ICD-O to NCIt mapping file.

        :returns: the path set by the last call to :meth:`load_icdo_to_ncit_tsv`,
            or None if that method has not completed yet
        """
        return self._icdo_to_ncit_path

    def load_icdo_to_ncit_tsv(self, overwrite: bool = False, local_dir: str = None):
        """Download, if necessary, the NCIt ICD-O mapping file into ``local_dir``.

        The tab-separated file served by NCI is re-written with the ``csv``
        module's default (comma-separated) dialect. On success the path of the
        local copy is available from :meth:`get_icdo_to_ncit_path`.

        :param overwrite: whether to overwrite an existing file (otherwise we skip downloading)
        :type overwrite: bool
        :param local_dir: Path to a directory to write downloaded file; the default
            directory from :meth:`get_local_share_directory` is used when None
        :raises requests.HTTPError: if the server answers with an error status
        """
        icdo_to_ncit_map_url = self._ICDO_TO_NCIT_URL
        # Honour the caller's directory instead of silently replacing it with the default.
        local_dir = self.get_local_share_directory(local_dir)
        icd_path = os.path.join(local_dir, self._ICDO_TO_NCIT_FILENAME)
        if overwrite or not os.path.isfile(icd_path):
            print(f"[INFO] Downloading {icdo_to_ncit_map_url}")
            response = requests.get(icdo_to_ncit_map_url, timeout=self._DOWNLOAD_TIMEOUT)
            response.raise_for_status()  # This will raise an error if the download failed
            tsv_data = csv.DictReader(response.text.splitlines(), delimiter='\t')
            with open(icd_path, 'w', newline='\n') as f:
                writer = csv.DictWriter(f, fieldnames=tsv_data.fieldnames)
                writer.writeheader()
                writer.writerows(tsv_data)
            print(f"[INFO] Downloaded {icdo_to_ncit_map_url}")
        self._icdo_to_ncit_path = icd_path

    def get_ncit_neoplasm_core(self):
        """Read the ``Neoplasm_Core.csv`` resource bundled in ``oncoexporter.ncit_files``.

        :returns: the text content of the resource
        """
        neo_core = files('oncoexporter.ncit_files').joinpath('Neoplasm_Core.csv').read_text()
        # NOTE(review): this dumps the whole file to stdout on every construction;
        # it looks like temporary debug output -- confirm before removing.
        print("NEO CORE", neo_core)
        return neo_core

    def get_local_share_directory(self, local_dir=None):
        """Return the directory used to cache downloaded files, creating it if needed.

        :param local_dir: directory to use; defaults to ``~/.oncoexporter`` when None
        :returns: the path of the (now existing) directory
        """
        if local_dir is None:
            local_dir = os.path.join(os.path.expanduser('~'), ".oncoexporter")
        if not os.path.exists(local_dir):
            # exist_ok guards against another process creating the directory
            # between the check above and this call.
            os.makedirs(local_dir, exist_ok=True)
            print(f"[INFO] Created new directory for oncoexporter at {local_dir}")
        return local_dir
17 changes: 7 additions & 10 deletions src/oncoexporter/cda/cda_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,17 @@
import os

import pandas as pd

from .cda_downloader import CdaDownloader

class CdaFactory(metaclass=abc.ABCMeta):
"""Superclass for the CDA Factory Classes
Each subclass must implement the to_ga4gh method, which transforms a row of a table from CDA to a GA4GH Message.
"""

def __init__(self, overwrite_downloads:bool=False):
    """Run the downloader and remember where the ICD-O to NCIt mapping file is.

    :param overwrite_downloads: forwarded to ``CdaDownloader.download_if_needed``
    :type overwrite_downloads: bool
    """
    cda_downloader = CdaDownloader()
    cda_downloader.download_if_needed(overwrite_downloads)
    # Keep only the resulting path; the downloader object itself is not retained.
    icdo_to_ncit_path = cda_downloader.get_icdo_to_ncit_path()
    self._icdo_to_ncit_path = icdo_to_ncit_path

@abc.abstractmethod
def to_ga4gh(self, row:pd.Series):
Expand Down Expand Up @@ -55,17 +58,11 @@ def days_to_iso(days: int):
def get_local_share_directory(self, local_dir=None):
my_platform = platform.platform()
my_system = platform.system()
if local_dir is None and ("macOS" in my_platform or "Darwin" in my_system):
local_dir = os.path.expanduser('~')
if local_dir is None and ("linux" in my_platform or "Linux" in my_system):
local_dir = os.path.expanduser('~')
if local_dir is None and my_platform == "Windows":
local_dir = os.path.expanduser('~')
if local_dir is None:
raise ValueError("Could not create local directory to store downloaded files for oncoexporter. Specify the local_dir argument to fix this")
local_dir = os.path.join(os.path.expanduser('~'), ".oncoexporter")
if not os.path.exists(local_dir):
os.makedirs(local_dir)
print(f"Created new directory for oncoexporter at {local_dir}")
print(f"[INFO] Created new directory for oncoexporter at {local_dir}")
return local_dir


4 changes: 0 additions & 4 deletions src/oncoexporter/downloaded_files/README.md

This file was deleted.

2 changes: 1 addition & 1 deletion src/oncoexporter/model/op_mutation.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

class OpMutation(OpMessage):

def __init__(self,cda_subject_id=None, primary_site=None, Hugo_Symbol=None, Entrez_Gene_Id=None, NCBI_Build=None, Chromosome=None,
def __init__(self, cda_subject_id=None, primary_site=None, Hugo_Symbol=None, Entrez_Gene_Id=None, NCBI_Build=None, Chromosome=None,
Start_Position=None, End_Position=None, Reference_Allele=None, Tumor_Seq_Allele1=None, Tumor_Seq_Allele2=None, dbSNP_RS=None,
dbSNP_Val_Status=None, Match_Norm_Seq_Allele1=None, Match_Norm_Seq_Allele2=None, Tumor_Validation_Allele1=None,
Tumor_Validation_Allele2=None, Match_Norm_Validation_Allele1=None, Match_Norm_Validation_Allele2=None,
Expand Down
Loading

0 comments on commit c40b5ee

Please sign in to comment.