downloader

monarch-initiative · Jan 14, 2024 · c40b5ee · c40b5ee
1 parent c2273d4
commit c40b5ee
Show file tree

Hide file tree

Showing 9 changed files with 1,476 additions and 53 deletions.
diff --git a/docs/explanations/cda_disease.md b/docs/explanations/cda_disease.md
@@ -7,18 +7,18 @@ We extract information about the disease diagnosis from two CDA tables, `diagnos
 ## diagnosis
 
 
-| Column          | Example        | Explanation |
-|:----------------|:---------------|:----------------|
-| diagnosis_id | CGCI-HTMCP-CC.HTMCP-03-06-02424.HTMCP-03-06-02424_diagnosis| y |
-| diagnosis_identifier | see below | y |
-| primary_diagnosis | Squamous cell carcinoma, keratinizing, NOS | y |
-| age_at_diagnosis | 13085.0 | y |
-| morphology | 8071/3 | y |
-| stage | None | y |
-| grade | G3 | y |
-| method_of_diagnosis | Biopsy | y |
-| subject_id | CGCI.HTMCP-03-06-02424 | y |
-| researchsubject_id | CGCI-HTMCP-CC.HTMCP-03-06-02424| y |
+| Column               | Example                                                     | Explanation |
+|:---------------------|:------------------------------------------------------------|:------------|
+| diagnosis_id         | CGCI-HTMCP-CC.HTMCP-03-06-02424.HTMCP-03-06-02424_diagnosis | y           |
+| diagnosis_identifier | see below                                                   | y           |
+| primary_diagnosis    | Squamous cell carcinoma, keratinizing, NOS                  | y           |
+| age_at_diagnosis     | 13085.0                                                     | y           |
+| morphology           | 8071/3                                                      | y           |
+| stage                | None                                                        | y           |
+| grade                | G3                                                          | y           |
+| method_of_diagnosis  | Biopsy                                                      | y           |
+| subject_id           | CGCI.HTMCP-03-06-02424                                      | y           |
+| researchsubject_id   | CGCI-HTMCP-CC.HTMCP-03-06-02424                             | y           |
 
 
 The fields of the table have the following meaning.
@@ -57,14 +57,14 @@ Identifier for the researchsubject (which can be a sample or an individaul - Que
 ## researchsubject
 
 
-| Column          | Example        | Explanation |
-|:----------------|:---------------|:----------------|
-| researchsubject_id | CPTAC-3.C3L-00563 | y |
-|  researchsubject_identifier     | see below | y |
-|   member_of_research_project    | CPTAC-3 | y |
-|  primary_diagnosis_condition     | Adenomas and Adenocarcinomas | y |
-|  primary_diagnosis_site     | Uterus, NOS  | y |
-|   subject_id    | CPTAC.C3L-00563 | y |
+| Column                         | Example                      | Explanation |
+|:-------------------------------|:-----------------------------|:------------|
+| researchsubject_id             | CPTAC-3.C3L-00563            | y           |
+| researchsubject_identifier     | see below                    | y           |
+| member_of_research_project     | CPTAC-3                      | y           |
+| primary_diagnosis_condition    | Adenomas and Adenocarcinomas | y           |
+| primary_diagnosis_site         | Uterus, NOS                  | y           |
+| subject_id                     | CPTAC.C3L-00563              | y           |
 
 
 - researchsubject_id

diff --git a/src/oncoexporter/cda/__init__.py b/src/oncoexporter/cda/__init__.py
@@ -1,4 +1,5 @@
 from .cda_disease_factory import CdaDiseaseFactory
+from .cda_factory import CdaFactory
 from .cda_individual_factory import CdaIndividualFactory
 from .cda_biosample_factory import CdaBiosampleFactory
 from .cda_mutation_factory import CdaMutationFactory
@@ -9,6 +10,7 @@
 __version__ = "0.0.2"
 
 __all__ = [
+"CdaFactory",
  "CdaDiseaseFactory",
  "CdaIndividualFactory",
  "CdaBiosampleFactory",

diff --git a/src/oncoexporter/cda/cda_disease_factory.py b/src/oncoexporter/cda/cda_disease_factory.py
@@ -3,7 +3,7 @@
 import phenopackets as PPkt
 import pandas as pd
 import os
-import pkg_resources
+
 import requests
 import csv
 import warnings
@@ -58,37 +58,26 @@ def __init__(self, op_mapper:OpMapper=None,
             self._opMapper = OpDiagnosisMapper()
         else:
             self._opMapper = op_mapper
-        self._icdo_to_ncit = self.load_icdo_to_ncit_tsv()
+       # self._icdo_to_ncit = self.load_icdo_to_ncit_tsv()
             #self._download_and_icdo_to_ncit_tsv(icdo_to_ncit_map_url, key_column=key_column)
 
 
     def load_icdo_to_ncit_tsv(self, overwrite:bool=False, local_dir:str=None):
         """
-        Download if necessary the NCIT ICD-O mapping file and store it in the package downloaded_files folder
+        Download if necessary the NCIT ICD-O mapping file and store it in the package ncit_files folder
         :param overwrite: whether to overwrite an existing file (otherwise we skip downloading)
         :type overwrite: bool
         :param local_dir: Path to a directory to write downloaded file
         """
-        icdo_to_ncit_map_url = 'https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/Mappings/ICD-O-3_Mappings/ICD-O-3.1-NCIt_Morphology_Mapping.txt',
         key_column = 'ICD-O Code'
-        local_dir = self.get_local_share_directory()
-        icd_path = os.path.join(local_dir, 'ICD-O-3.1-NCIt_Morphology_Mapping.txt')
-        if not os.path.isfile(icd_path):
-            print(f"Downloading {icdo_to_ncit_map_url}")
-            response = requests.get(icdo_to_ncit_map_url)
-            response.raise_for_status()  # This will raise an error if the download failed
-            tsv_data = csv.DictReader(response.text.splitlines(), delimiter='\t')
-            with open(icd_path, 'w', newline='\n') as f:
-                writer = csv.writer(f)
-                writer.writerows(tsv_data)
-            print(f"Downloaded {icdo_to_ncit_map_url}")
         # When we get here, either we have just downloaded the ICD-O file or it was already available locally.
-        stream = pkg_resources.resource_stream(icd_path)
-        df = pd.read_csv(stream, encoding='latin-1')
+        # stream = pkg_resources.resource_stream("", icd_path)
+        icd_path = self._icdo_to_ncit_path
+        df = pd.read_csv(icd_path, encoding='latin-1')
         result_dict = {}
         if key_column not in df.columns:
             raise ValueError(f"Couldn't find key_column {key_column} in fieldnames "
-                          f"{df.columns} of file downloaded from {icdo_to_ncit_map_url}")
+                          f"{df.columns} of file at {icd_path}")
         for idx, row in df.iterrows():
             key = row[key_column]
             if key:
@@ -117,6 +106,9 @@ def _download_and_icdo_to_ncit_tsv(self, url: str, key_column: str) -> dict:
 
         return result_dict
 
+    def temp_get_load_icdo_to_ncit_tsv(self):
+        return self._icdo_to_ncit
+
     def to_ga4gh(self, row):
         """Convert a row from the CDA subject table into an Individual message (GA4GH Phenopacket Schema)
 

diff --git a/src/oncoexporter/cda/cda_downloader.py b/src/oncoexporter/cda/cda_downloader.py
@@ -0,0 +1,60 @@
+import os
+import platform
+import requests
+import csv
+import warnings
+from importlib_resources import files
+
+
+class CdaDownloader:
+
+    def __init__(self):
+        self.get_ncit_neoplasm_core()
+
+    def download_if_needed(self, overwrite_downloads:bool):
+        local_dir = self.get_local_share_directory()
+        self._icdo_to_ncit_path = None
+        self.load_icdo_to_ncit_tsv(overwrite=overwrite_downloads, local_dir=local_dir)
+
+    def get_icdo_to_ncit_path(self):
+        return self._icdo_to_ncit_path
+
+    def load_icdo_to_ncit_tsv(self, overwrite:bool, local_dir:str):
+        """
+        Download if necessary the NCIT ICD-O mapping file and store it in the package ncit_files folder
+        :param overwrite: whether to overwrite an existing file (otherwise we skip downloading)
+        :type overwrite: bool
+        :param local_dir: Path to a directory to write downloaded file
+        """
+        icdo_to_ncit_map_url = 'https://evs.nci.nih.gov/ftp1/NCI_Thesaurus/Mappings/ICD-O-3_Mappings/ICD-O-3.1-NCIt_Morphology_Mapping.txt'
+        local_dir = self.get_local_share_directory()
+        icd_path = os.path.join(local_dir, 'ICD-O-3.1-NCIt_Morphology_Mapping.txt')
+        if not os.path.isfile(icd_path):
+            print(f"[INFO] Downloading {icdo_to_ncit_map_url}")
+            response = requests.get(icdo_to_ncit_map_url)
+            response.raise_for_status()  # This will raise an error if the download failed
+            tsv_data = csv.DictReader(response.text.splitlines(), delimiter='\t')
+            with open(icd_path, 'w', newline='\n') as f:
+                writer = csv.DictWriter(f, fieldnames=tsv_data.fieldnames)
+                writer.writeheader()
+                for row in tsv_data:
+                    writer.writerow(row)
+            print(f"[INFO] Downloaded {icdo_to_ncit_map_url}")
+        self._icdo_to_ncit_path = icd_path
+
+
+
+    def get_ncit_neoplasm_core(self):
+        # Reads contents with UTF-8 encoding and returns str.
+        neo_core = files('oncoexporter.ncit_files').joinpath('Neoplasm_Core.csv').read_text()
+        print("NEO CORE", neo_core)
+
+    def get_local_share_directory(self, local_dir=None):
+        my_platform = platform.platform()
+        my_system = platform.system()
+        if local_dir is None:
+            local_dir = os.path.join(os.path.expanduser('~'), ".oncoexporter")
+        if not os.path.exists(local_dir):
+            os.makedirs(local_dir)
+            print(f"[INFO] Created new directory for oncoexporter at {local_dir}")
+        return local_dir
diff --git a/src/oncoexporter/cda/cda_factory.py b/src/oncoexporter/cda/cda_factory.py
@@ -3,14 +3,17 @@
 import os
 
 import pandas as pd
-
+from .cda_downloader import CdaDownloader
 
 class CdaFactory(metaclass=abc.ABCMeta):
     """Superclass for the CDA Factory Classes
 
     Each subclass must implement the to_ga4gh method, which transforms a row of a table from CDA to a GA4GH Message.
     """
-
+    def __init__(self, overwrite_downloads:bool=False):
+        downloader = CdaDownloader()
+        downloader.download_if_needed(overwrite_downloads)
+        self._icdo_to_ncit_path = downloader.get_icdo_to_ncit_path()
 
     @abc.abstractmethod
     def to_ga4gh(self, row:pd.Series):
@@ -55,17 +58,11 @@ def days_to_iso(days: int):
     def get_local_share_directory(self, local_dir=None):
         my_platform = platform.platform()
         my_system = platform.system()
-        if local_dir is None and ("macOS" in my_platform or "Darwin" in my_system):
-            local_dir = os.path.expanduser('~')
-        if local_dir is None and ("linux" in my_platform or "Linux" in my_system):
-            local_dir = os.path.expanduser('~')
-        if local_dir is None and my_platform == "Windows":
-            local_dir = os.path.expanduser('~')
         if local_dir is None:
-            raise ValueError("Could not create local directory to store downloaded files for oncoexporter. Specify the local_dir argument to fix this")
+            local_dir = os.path.join(os.path.expanduser('~'), ".oncoexporter")
         if not os.path.exists(local_dir):
             os.makedirs(local_dir)
-            print(f"Created new directory for oncoexporter at {local_dir}")
+            print(f"[INFO] Created new directory for oncoexporter at {local_dir}")
         return local_dir
 
 
diff --git a/src/oncoexporter/downloaded_files/README.md b/src/oncoexporter/downloaded_files/README.md
diff --git a/src/oncoexporter/model/op_mutation.py b/src/oncoexporter/model/op_mutation.py
@@ -5,7 +5,7 @@
 
 class OpMutation(OpMessage):
 
-    def __init__(self,cda_subject_id=None, primary_site=None, Hugo_Symbol=None, Entrez_Gene_Id=None, NCBI_Build=None, Chromosome=None,
+    def __init__(self, cda_subject_id=None, primary_site=None, Hugo_Symbol=None, Entrez_Gene_Id=None, NCBI_Build=None, Chromosome=None,
         Start_Position=None, End_Position=None, Reference_Allele=None, Tumor_Seq_Allele1=None, Tumor_Seq_Allele2=None, dbSNP_RS=None,
         dbSNP_Val_Status=None, Match_Norm_Seq_Allele1=None, Match_Norm_Seq_Allele2=None, Tumor_Validation_Allele1=None,
         Tumor_Validation_Allele2=None, Match_Norm_Validation_Allele1=None, Match_Norm_Validation_Allele2=None,