Merge branch 'develop'

monarch-initiative · Sep 24, 2023 · 2d83325 · 2d83325
2 parents 043cafd + 9b8048b
commit 2d83325
Show file tree

Hide file tree

Showing 17 changed files with 169 additions and 81 deletions.
diff --git a/notebooks/BuildingACohort.ipynb b/notebooks/BuildingACohort.ipynb
@@ -736,7 +736,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "Tsite = Q('treatment_anatomic_site = \"Cervix\"')"
+    "Tsite = Q('treatment_anatomic_site = \"Cervix\"', )"
    ]
   },
   {

diff --git a/src/__init__.py b/src/__init__.py
diff --git a/src/oncoexporter/__init__.py b/src/oncoexporter/__init__.py
@@ -2,7 +2,7 @@
 oncoexporter is a library for transforming National Cancer Institute (NCI) data into phenopackets.
 """
 
-__version__ = '0.0.3'
+__version__ = '0.0.4'
 
 from .cda.cda_individual_factory import CdaIndividualFactory
 

diff --git a/src/oncoexporter/cda/.gitignore b/src/oncoexporter/cda/.gitignore
@@ -0,0 +1 @@
+*.pkl
diff --git a/src/oncoexporter/cda/__init__.py b/src/oncoexporter/cda/__init__.py
@@ -1,14 +1,18 @@
 from .cda_disease_factory import CdaDiseaseFactory
 from .cda_individual_factory import CdaIndividualFactory
+from .cda_biosample import CdaBiosampleFactory
 from .cda_mutation_factory import CdaMutationFactory
 from .cda_table_importer import CdaTableImporter
+from .cda_medicalaction_factory import make_cda_medicalaction
 
 
 __version__ = "0.0.2"
 
 __all__ = [
  "CdaDiseaseFactory",
  "CdaIndividualFactory",
+ "CdaBiosampleFactory",
  "CdaMutationFactory",
- "CdaTableImporter"
+ "CdaTableImporter",
+ "make_cda_medicalaction"
 ]
diff --git a/src/oncoexporter/model/op_biosample.py → src/oncoexporter/cda/cda_biosample.py b/src/oncoexporter/model/op_biosample.py → src/oncoexporter/cda/cda_biosample.py
@@ -1,8 +1,9 @@
 import typing
 
-import pandas as pd
 import phenopackets as pp
 
+from .cda_factory import CdaFactory
+
 HOMO_SAPIENS = pp.OntologyClass(id='NCBITaxon:9606', label='Homo sapiens')
 LUNG = pp.OntologyClass(id='UBERON:0002048', label='lung')
 
@@ -23,45 +24,49 @@
 SLIDE = pp.OntologyClass(id='NCIT:C165218', label='Diagnostic Slide')
 
 
-def make_cda_biosample(row: pd.Series) -> pp.Biosample:
-    biosample = pp.Biosample()
+class CdaBiosampleFactory(CdaFactory):
+    """
+    Class for creating a `Biosample` element from a row of the `specimen` CDA table.
+    """
 
-    biosample.id = row['specimen_id']
+    def from_cancer_data_aggregator(self, row) -> pp.Biosample:
+        biosample = pp.Biosample()
 
-    derived_from_subj = row['derived_from_subject']
-    if derived_from_subj is not None:
-        biosample.individual_id = derived_from_subj
+        biosample.id = row['specimen_id']
 
-    # derived_from_specimen -> derived_from_id
-    derived_from = row['derived_from_specimen']
-    if derived_from is not None:
-        if derived_from == 'initial specimen':
-            biosample.derived_from_id = derived_from_subj
-        else:
-            biosample.derived_from_id = derived_from
+        derived_from_subj = row['derived_from_subject']
+        if derived_from_subj is not None:
+            biosample.individual_id = derived_from_subj
 
-    # anatomical_site -> sampled_tissue
-    sampled_tissue = _map_anatomical_site(row['anatomical_site'])
-    if sampled_tissue is not None:
-        biosample.sampled_tissue.CopyFrom(sampled_tissue)
+        # derived_from_specimen -> derived_from_id
+        derived_from = row['derived_from_specimen']
+        if derived_from is not None:
+            if derived_from == 'initial specimen':
+                biosample.derived_from_id = derived_from_subj
+            else:
+                biosample.derived_from_id = derived_from
 
-    sample_type = _map_specimen_type(row['specimen_type'])
-    if sample_type is not None:
-        biosample.sample_type.CopyFrom(sample_type)
+        # anatomical_site -> sampled_tissue
+        sampled_tissue = _map_anatomical_site(row['anatomical_site'])
+        if sampled_tissue is not None:
+            biosample.sampled_tissue.CopyFrom(sampled_tissue)
 
-    biosample.taxonomy.CopyFrom(HOMO_SAPIENS)
+        sample_type = _map_specimen_type(row['specimen_type'])
+        if sample_type is not None:
+            biosample.sample_type.CopyFrom(sample_type)
 
-    # primary_disease_type -> histological_diagnosis
-    histological_diagnosis = _map_primary_disease_type(row['primary_disease_type'])
-    if histological_diagnosis is not None:
-        biosample.histological_diagnosis.CopyFrom(histological_diagnosis)
+        biosample.taxonomy.CopyFrom(HOMO_SAPIENS)
 
-    material_sample = _map_source_material_type(row['source_material_type'])
-    if material_sample is not None:
-        biosample.material_sample.CopyFrom(material_sample)
+        # primary_disease_type -> histological_diagnosis
+        histological_diagnosis = _map_primary_disease_type(row['primary_disease_type'])
+        if histological_diagnosis is not None:
+            biosample.histological_diagnosis.CopyFrom(histological_diagnosis)
 
+        material_sample = _map_source_material_type(row['source_material_type'])
+        if material_sample is not None:
+            biosample.material_sample.CopyFrom(material_sample)
 
-    return biosample
+        return biosample
 
 
 def _map_anatomical_site(val: typing.Optional[str]) -> typing.Optional[pp.OntologyClass]:

diff --git a/src/oncoexporter/cda/cda_disease_factory.py b/src/oncoexporter/cda/cda_disease_factory.py
@@ -1,8 +1,8 @@
 import phenopackets as PPkt
 import pandas as pd
 
-
-from ..cda.mapper.op_diagnosis_mapper import OpDiagnosisMapper
+from .mapper.op_mapper import OpMapper
+from .mapper.op_diagnosis_mapper import OpDiagnosisMapper
 from .cda_factory import CdaFactory
 
 

diff --git a/src/oncoexporter/cda/cda_individual_factory.py b/src/oncoexporter/cda/cda_individual_factory.py
@@ -1,7 +1,7 @@
 import phenopackets as PPkt
 import pandas as pd
 
-from ..model.op_Individual import OpIndividual
+from oncoexporter.model.op_Individual import OpIndividual
 from  .mapper.op_cause_of_death_mapper import OpCauseOfDeathMapper
 from .cda_factory import CdaFactory
 
@@ -17,7 +17,6 @@ def __init__(self) -> None:
         super().__init__()
         self._cause_of_death_mapper = OpCauseOfDeathMapper()
 
-
     @staticmethod
     def days_to_iso(days: int):
         """

diff --git a/medicalaction.py → ...exporter/cda/cda_medicalaction_factory.py b/medicalaction.py → ...exporter/cda/cda_medicalaction_factory.py
@@ -26,19 +26,27 @@
 Not_Otherwise_Specified = pp.OntologyClass(id='NCIT:C19594', label='Not Otherwise Specified')
 UNKNOWN = pp.OntologyClass(id='NCIT:C17998', label='Unknown')
 
+
+
+
+
 def make_cda_medicalaction(row: pd.Series) -> pp.MedicalAction:
     medicalaction = pp.MedicalAction()
 
-    treatment = pp.Treatment()
+    treatment_type = row["treatment_type"]
+    if treatment_type == "Chemotherapy":
+        # Use the GA4GHTreatment message
+        # therapeutic_agent -> treatment agent
+        treatment_agent = _map_therapeutic_agent(row['therapeutic_agent'])
+        # radiation = _map_radiation(row['therapeutic_agent'])
+        if treatment_agent is not None:
+            treatment = pp.Treatment()
+            treatment.agent.CopyFrom(treatment_agent)
+            medicalaction.treatment.CopyFrom(treatment)
+    elif "Radiation Therapy, NOS" == treatment_type:
+        ## Use GA4GH RadiationTherapy object
+        pass
 
-    # therapeutic_agent -> treatment agent
-    treatment_agent = _map_therapeutic_agent(row['therapeutic_agent'])
-    if treatment_agent is not None:
-        treatment.agent.CopyFrom(treatment_agent)
-
-    action = pp.OntologyClass()
-    action.CopyFrom(treatment)
-    medicalaction.action.CpyForm(action)
 
     # treatment_outcome -> response_to_treatment
     response_to_treatment = _map_response_to_treatment(row['treatment_outcome'])
@@ -47,7 +55,8 @@ def make_cda_medicalaction(row: pd.Series) -> pp.MedicalAction:
 
     return medicalaction
 
-def _map_therapeutic_agent(val: typing.Optional[str]=None) -> typing.Optional[pp.OntologyClass]:
+
+def _map_response_to_treatment(val: typing.Optional[str]=None) -> typing.Optional[pp.OntologyClass]:
     if val is not None:
         val = val.lower()
         if val == "progressive disease":

diff --git a/src/oncoexporter/cda/cda_table_importer.py b/src/oncoexporter/cda/cda_table_importer.py
@@ -1,18 +1,22 @@
-from cdapython import (
-    Q
-)
+import os.path
+
+from cdapython import Q
 import phenopackets as PPkt
 import typing
 import pandas as pd
+import pickle
 from . import CdaDiseaseFactory
 from .cda_importer import CdaImporter
 from .cda_individual_factory import CdaIndividualFactory
+from .cda_biosample import CdaBiosampleFactory
+from .cda_mutation_factory import CdaMutationFactory
+from .cda_medicalaction_factory import make_cda_medicalaction
 from tqdm import tqdm
 
 class CdaTableImporter(CdaImporter):
 
 
-    def __init__(self, query:str=None, query_obj:Q=None):
+    def __init__(self, query:str=None, query_obj:Q=None, use_cache=False):
         """
         :param query: A query for CDA such as 'primary_diagnosis_site = "Lung"'
 
@@ -29,38 +33,114 @@ def __init__(self, query:str=None, query_obj:Q=None):
         else:
             raise ValueError("Need to pass either query or query_obj argument but not both")
         self._ppackt_d = {} # key -- patient ID, value: PPkt.Phenopacket
+        self._use_cache = use_cache
+
+    def get_diagnosis_df(self, callable, cache_name: str):
+        print(f"Retrieving dataframe {cache_name}")
+        if self._use_cache and os.path.isfile(cache_name):
+            with open(cache_name, 'rb') as cachehandle:
+                print(f"loading cached dataframe from {cache_name}")
+                individual_df = pickle.load(cachehandle)
+        else:
+            print(f"calling CDA function")
+            individual_df = callable()
+            if self._use_cache:
+                print(f"Creating cached dataframe as {cache_name}")
+                with open(cache_name, 'wb') as f:
+                    pickle.dump(individual_df, f)
+        return individual_df
 
     def get_ga4gh_phenopackets(self) -> typing.List[PPkt.Phenopacket]:
         """
 
         1.
 
         """
+        subject_id_to_interpretation = {}
+
         individual_factory = CdaIndividualFactory()
-        individual_df = self._query.subject.run().get_all().to_dataframe()
-        for idx, row in tqdm(individual_df.iterrows(), total=len(individual_df), desc="individual messages"):
-        #for idx, row in individual_df.iterrows():
+        callable = lambda: self._query.subject.run().get_all().to_dataframe()
+        print("getting individual_df")
+        individual_df = self.get_diagnosis_df(callable, "individual_df.pkl")
+        print("obtained individual_df")
+        diagnosis_callable = lambda: self._query.diagnosis.run().get_all().to_dataframe()
+        diagnosis_df = self.get_diagnosis_df(diagnosis_callable, "diagnosis_df.pkl")
+        print("obtained diagnosis_df")
+        rsub_callable = lambda: self._query.researchsubject.run().get_all().to_dataframe()
+        rsub_df = self.get_diagnosis_df(rsub_callable, "rsub_df.pkl")
+        print("obtained rsub_df")
+
+        specimen_callable = lambda: self._query.specimen.run().get_all().to_dataframe()
+        specimen_df = self.get_diagnosis_df(specimen_callable, "specimen_df.pkl")
+
+        treatment_callable = lambda: self._query.treatment.run().get_all().to_dataframe()
+        treatment_df = self.get_diagnosis_df(treatment_callable, "treatment_df.pkl")
+
+        mutation_callable = lambda: self._query.mutation.run().get_all().to_dataframe()
+        mutation_df = self.get_diagnosis_df(mutation_callable, "mutation_df.pkl")
+
+        for idx, row in tqdm(individual_df.iterrows(),total=len(individual_df), desc= "individual dataframe"):
             individual_message = individual_factory.from_cancer_data_aggregator(row=row)
-            individual_id = individual_message.id
+            indivudal_id = individual_message.id
+            interpretation = PPkt.Interpretation()
+            interpretation.id = "id"
+            interpretation.progress_status = PPkt.Interpretation.ProgressStatus.SOLVED
+            subject_id_to_interpretation[indivudal_id] = interpretation
             ppackt = PPkt.Phenopacket()
             ppackt.subject.CopyFrom(individual_message)
-            self._ppackt_d[individual_id] = ppackt
-        diagnosis_df = self._query.diagnosis.run().get_all().to_dataframe()
-        rsub_df = self._query.researchsubject.run().get_all().to_dataframe()  # view the dataframe
+            self._ppackt_d[indivudal_id] = ppackt
         merged_df = pd.merge(diagnosis_df, rsub_df, left_on='subject_id', right_on='subject_id',
                              suffixes=["_di", "_rs"])
         disease_factory = CdaDiseaseFactory()
-        for idx, row in tqdm( merged_df.iterrows(), total=len(merged_df), desc="disease messages"):
-        #for idx, row in merged_df.iterrows():
+        for idx, row in tqdm(merged_df.iterrows(), total= len(merged_df.index), desc="merged diagnosis dataframe"):
             disease_message = disease_factory.from_cancer_data_aggregator(row)
             individual_id = row["subject_id"]
+            if individual_id not in subject_id_to_interpretation:
+                raise ValueError(f"Could not find individual id {individual_id} in subject_id_to_disease")
+            subject_id_to_interpretation.get(individual_id).diagnosis.disease.CopyFrom(disease_message.term)
             if individual_id not in self._ppackt_d:
                 raise ValueError(f"Attempt to enter unknown individual ID from disease factory: \"{individual_id}\"")
             self._ppackt_d.get(individual_id).diseases.append(disease_message)
 
+        specimen_factory = CdaBiosampleFactory()
+        for idx, row in tqdm(specimen_df.iterrows(),total= len(specimen_df.index), desc="specimen/biosample dataframe"):
+            biosample_message = specimen_factory.from_cancer_data_aggregator(row)
+            individual_id = row["subject_id"]
+            if individual_id not in self._ppackt_d:
+                raise ValueError(f"Attempt to enter unknown individual ID from biosample factory: \"{individual_id}\"")
+            self._ppackt_d.get(individual_id).biosamples.append(biosample_message)
+
+        mutation_factory = CdaMutationFactory()
+        for idx, row in tqdm(mutation_df.iterrows(), total=len(mutation_df.index), desc="mutation dataframe"):
+            individual_id = row["cda_subject_id"]
+            if individual_id not in subject_id_to_interpretation:
+                raise ValueError(f"Could not find individual id {individual_id} in subject_id_to_interpretation")
+            pp = self._ppackt_d[individual_id]
+            if len(pp.interpretations) == 0:
+                interpretation = PPkt.Interpretation()
+                disease = pp.diseases[0]
+                diagnosis = PPkt.Diagnosis()
+                diagnosis.disease.CopyFrom(disease.term)
+                interpretation.diagnosis.CopyFrom(diagnosis)
+                pp.interpretations.append(interpretation)
+            else:
+                diagnosis = pp.interpretations[0].diagnosis
+            variant_interpretation_message = mutation_factory.from_cancer_data_aggregator(row)
+            genomic_interpretation = PPkt.GenomicInterpretation()
+            # TODO -- CLEAN UP
+            genomic_interpretation.subject_or_biosample_id = row["Tumor_Aliquot_UUID"]
+            # by assumption, variants passed to this package are all causative -- ASK CDA
+            # genomic_interpretation.interpretation_status = PPkt.GenomicInterpretation.InterpretationStatus.CAUSATIVE
+            genomic_interpretation.variant_interpretation.CopyFrom(variant_interpretation_message)
+            diagnosis.genomic_interpretations.append(genomic_interpretation)
+
+        # make_cda_medicalaction
+        for idx, row in tqdm(treatment_df.iterrows(), total=len(treatment_df.index), desc="Treatment DF"):
+            individual_id = row["subject_id"]
+            medical_action_message = make_cda_medicalaction(row)
+            if individual_id not in self._ppackt_d:
+                raise ValueError(f"Attempt to enter unknown individual ID from treatemtn factory: \"{individual_id}\"")
+            self._ppackt_d.get(individual_id).medical_actions.append(medical_action_message)
 
-        spcimen_df = self._query.specimen.run().get_all().to_dataframe()
-        ## get specimen message
-        ## copy to corresponding phenopacket
         return list(self._ppackt_d.values())
 
diff --git a/src/oncoexporter/cda/mapper/op_mapper.py b/src/oncoexporter/cda/mapper/op_mapper.py
@@ -1,6 +1,5 @@
 import abc
 from typing import Optional
-import pandas as pd
 import phenopackets as PPkt
 
 
@@ -11,11 +10,6 @@ class OpMapper(metaclass=abc.ABCMeta):
     Superclass for mapper objects that map input data to Ontology classes.
     Subclasses may create state in the constructor
     """
-    def __init__(self, ncit_obo=None) -> None:
-        # init OAK for NCIT
-        # init OAK for Mondo
-        # etc.
-        pass
 
     @abc.abstractmethod
     def get_ontology_term(self, row) ->Optional[PPkt.OntologyClass]:

diff --git a/src/oncoexporter/model/op_Individual.py b/src/oncoexporter/model/op_Individual.py
@@ -1,5 +1,4 @@
 import phenopackets as PPkt
-import pandas as pd
 from .op_message import OpMessage
 
 class OpIndividual(OpMessage):

diff --git a/src/oncoexporter/model/op_disease.py b/src/oncoexporter/model/op_disease.py
@@ -1,5 +1,3 @@
-import phenopackets as PPkt
-import pandas as pd
 from .op_message import OpMessage
 
 class OpDisease(OpMessage):

diff --git a/tests/__init__.py b/tests/__init__.py