Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
pnrobinson committed Sep 24, 2023
2 parents 043cafd + 9b8048b commit 2d83325
Show file tree
Hide file tree
Showing 17 changed files with 169 additions and 81 deletions.
2 changes: 1 addition & 1 deletion notebooks/BuildingACohort.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -736,7 +736,7 @@
"metadata": {},
"outputs": [],
"source": [
"Tsite = Q('treatment_anatomic_site = \"Cervix\"')"
"Tsite = Q('treatment_anatomic_site = \"Cervix\"', )"
]
},
{
Expand Down
Empty file removed src/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion src/oncoexporter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
oncoexporter is a library for transforming National Cancer Institute (NCI) data into phenopackets.
"""

__version__ = '0.0.3'
__version__ = '0.0.4'

from .cda.cda_individual_factory import CdaIndividualFactory

Expand Down
1 change: 1 addition & 0 deletions src/oncoexporter/cda/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.pkl
6 changes: 5 additions & 1 deletion src/oncoexporter/cda/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
from .cda_disease_factory import CdaDiseaseFactory
from .cda_individual_factory import CdaIndividualFactory
from .cda_biosample import CdaBiosampleFactory
from .cda_mutation_factory import CdaMutationFactory
from .cda_table_importer import CdaTableImporter
from .cda_medicalaction_factory import make_cda_medicalaction


__version__ = "0.0.2"

__all__ = [
"CdaDiseaseFactory",
"CdaIndividualFactory",
"CdaBiosampleFactory",
"CdaMutationFactory",
"CdaTableImporter"
"CdaTableImporter",
"make_cda_medicalaction"
]
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import typing

import pandas as pd
import phenopackets as pp

from .cda_factory import CdaFactory

HOMO_SAPIENS = pp.OntologyClass(id='NCBITaxon:9606', label='Homo sapiens')
LUNG = pp.OntologyClass(id='UBERON:0002048', label='lung')

Expand All @@ -23,45 +24,49 @@
SLIDE = pp.OntologyClass(id='NCIT:C165218', label='Diagnostic Slide')


def make_cda_biosample(row: pd.Series) -> pp.Biosample:
biosample = pp.Biosample()
class CdaBiosampleFactory(CdaFactory):
"""
Class for creating a `Biosample` element from a row of the `specimen` CDA table.
"""

biosample.id = row['specimen_id']
def from_cancer_data_aggregator(self, row) -> pp.Biosample:
biosample = pp.Biosample()

derived_from_subj = row['derived_from_subject']
if derived_from_subj is not None:
biosample.individual_id = derived_from_subj
biosample.id = row['specimen_id']

# derived_from_specimen -> derived_from_id
derived_from = row['derived_from_specimen']
if derived_from is not None:
if derived_from == 'initial specimen':
biosample.derived_from_id = derived_from_subj
else:
biosample.derived_from_id = derived_from
derived_from_subj = row['derived_from_subject']
if derived_from_subj is not None:
biosample.individual_id = derived_from_subj

# anatomical_site -> sampled_tissue
sampled_tissue = _map_anatomical_site(row['anatomical_site'])
if sampled_tissue is not None:
biosample.sampled_tissue.CopyFrom(sampled_tissue)
# derived_from_specimen -> derived_from_id
derived_from = row['derived_from_specimen']
if derived_from is not None:
if derived_from == 'initial specimen':
biosample.derived_from_id = derived_from_subj
else:
biosample.derived_from_id = derived_from

sample_type = _map_specimen_type(row['specimen_type'])
if sample_type is not None:
biosample.sample_type.CopyFrom(sample_type)
# anatomical_site -> sampled_tissue
sampled_tissue = _map_anatomical_site(row['anatomical_site'])
if sampled_tissue is not None:
biosample.sampled_tissue.CopyFrom(sampled_tissue)

biosample.taxonomy.CopyFrom(HOMO_SAPIENS)
sample_type = _map_specimen_type(row['specimen_type'])
if sample_type is not None:
biosample.sample_type.CopyFrom(sample_type)

# primary_disease_type -> histological_diagnosis
histological_diagnosis = _map_primary_disease_type(row['primary_disease_type'])
if histological_diagnosis is not None:
biosample.histological_diagnosis.CopyFrom(histological_diagnosis)
biosample.taxonomy.CopyFrom(HOMO_SAPIENS)

material_sample = _map_source_material_type(row['source_material_type'])
if material_sample is not None:
biosample.material_sample.CopyFrom(material_sample)
# primary_disease_type -> histological_diagnosis
histological_diagnosis = _map_primary_disease_type(row['primary_disease_type'])
if histological_diagnosis is not None:
biosample.histological_diagnosis.CopyFrom(histological_diagnosis)

material_sample = _map_source_material_type(row['source_material_type'])
if material_sample is not None:
biosample.material_sample.CopyFrom(material_sample)

return biosample
return biosample


def _map_anatomical_site(val: typing.Optional[str]) -> typing.Optional[pp.OntologyClass]:
Expand Down
4 changes: 2 additions & 2 deletions src/oncoexporter/cda/cda_disease_factory.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import phenopackets as PPkt
import pandas as pd


from ..cda.mapper.op_diagnosis_mapper import OpDiagnosisMapper
from .mapper.op_mapper import OpMapper
from .mapper.op_diagnosis_mapper import OpDiagnosisMapper
from .cda_factory import CdaFactory


Expand Down
3 changes: 1 addition & 2 deletions src/oncoexporter/cda/cda_individual_factory.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import phenopackets as PPkt
import pandas as pd

from ..model.op_Individual import OpIndividual
from oncoexporter.model.op_Individual import OpIndividual
from .mapper.op_cause_of_death_mapper import OpCauseOfDeathMapper
from .cda_factory import CdaFactory

Expand All @@ -17,7 +17,6 @@ def __init__(self) -> None:
super().__init__()
self._cause_of_death_mapper = OpCauseOfDeathMapper()


@staticmethod
def days_to_iso(days: int):
"""
Expand Down
29 changes: 19 additions & 10 deletions medicalaction.py → ...exporter/cda/cda_medicalaction_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,19 +26,27 @@
Not_Otherwise_Specified = pp.OntologyClass(id='NCIT:C19594', label='Not Otherwise Specified')
UNKNOWN = pp.OntologyClass(id='NCIT:C17998', label='Unknown')





def make_cda_medicalaction(row: pd.Series) -> pp.MedicalAction:
medicalaction = pp.MedicalAction()

treatment = pp.Treatment()
treatment_type = row["treatment_type"]
if treatment_type == "Chemotherapy":
# Use the GA4GHTreatment message
# therapeutic_agent -> treatment agent
treatment_agent = _map_therapeutic_agent(row['therapeutic_agent'])
# radiation = _map_radiation(row['therapeutic_agent'])
if treatment_agent is not None:
treatment = pp.Treatment()
treatment.agent.CopyFrom(treatment_agent)
medicalaction.treatment.CopyFrom(treatment)
elif "Radiation Therapy, NOS" == treatment_type:
## Use GA4GH RadiationTherapy object
pass

# therapeutic_agent -> treatment agent
treatment_agent = _map_therapeutic_agent(row['therapeutic_agent'])
if treatment_agent is not None:
treatment.agent.CopyFrom(treatment_agent)

action = pp.OntologyClass()
action.CopyFrom(treatment)
medicalaction.action.CpyForm(action)

# treatment_outcome -> response_to_treatment
response_to_treatment = _map_response_to_treatment(row['treatment_outcome'])
Expand All @@ -47,7 +55,8 @@ def make_cda_medicalaction(row: pd.Series) -> pp.MedicalAction:

return medicalaction

def _map_therapeutic_agent(val: typing.Optional[str]=None) -> typing.Optional[pp.OntologyClass]:

def _map_response_to_treatment(val: typing.Optional[str]=None) -> typing.Optional[pp.OntologyClass]:
if val is not None:
val = val.lower()
if val == "progressive disease":
Expand Down
112 changes: 96 additions & 16 deletions src/oncoexporter/cda/cda_table_importer.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
from cdapython import (
Q
)
import os.path

from cdapython import Q
import phenopackets as PPkt
import typing
import pandas as pd
import pickle
from . import CdaDiseaseFactory
from .cda_importer import CdaImporter
from .cda_individual_factory import CdaIndividualFactory
from .cda_biosample import CdaBiosampleFactory
from .cda_mutation_factory import CdaMutationFactory
from .cda_medicalaction_factory import make_cda_medicalaction
from tqdm import tqdm

class CdaTableImporter(CdaImporter):


def __init__(self, query:str=None, query_obj:Q=None):
def __init__(self, query:str=None, query_obj:Q=None, use_cache=False):
"""
:param query: A query for CDA such as 'primary_diagnosis_site = "Lung"'
Expand All @@ -29,38 +33,114 @@ def __init__(self, query:str=None, query_obj:Q=None):
else:
raise ValueError("Need to pass either query or query_obj argument but not both")
self._ppackt_d = {} # key -- patient ID, value: PPkt.Phenopacket
self._use_cache = use_cache

def get_diagnosis_df(self, callable, cache_name: str):
print(f"Retrieving dataframe {cache_name}")
if self._use_cache and os.path.isfile(cache_name):
with open(cache_name, 'rb') as cachehandle:
print(f"loading cached dataframe from {cache_name}")
individual_df = pickle.load(cachehandle)
else:
print(f"calling CDA function")
individual_df = callable()
if self._use_cache:
print(f"Creating cached dataframe as {cache_name}")
with open(cache_name, 'wb') as f:
pickle.dump(individual_df, f)
return individual_df

def get_ga4gh_phenopackets(self) -> typing.List[PPkt.Phenopacket]:
"""
1.
"""
subject_id_to_interpretation = {}

individual_factory = CdaIndividualFactory()
individual_df = self._query.subject.run().get_all().to_dataframe()
for idx, row in tqdm(individual_df.iterrows(), total=len(individual_df), desc="individual messages"):
#for idx, row in individual_df.iterrows():
callable = lambda: self._query.subject.run().get_all().to_dataframe()
print("getting individual_df")
individual_df = self.get_diagnosis_df(callable, "individual_df.pkl")
print("obtained individual_df")
diagnosis_callable = lambda: self._query.diagnosis.run().get_all().to_dataframe()
diagnosis_df = self.get_diagnosis_df(diagnosis_callable, "diagnosis_df.pkl")
print("obtained diagnosis_df")
rsub_callable = lambda: self._query.researchsubject.run().get_all().to_dataframe()
rsub_df = self.get_diagnosis_df(rsub_callable, "rsub_df.pkl")
print("obtained rsub_df")

specimen_callable = lambda: self._query.specimen.run().get_all().to_dataframe()
specimen_df = self.get_diagnosis_df(specimen_callable, "specimen_df.pkl")

treatment_callable = lambda: self._query.treatment.run().get_all().to_dataframe()
treatment_df = self.get_diagnosis_df(treatment_callable, "treatment_df.pkl")

mutation_callable = lambda: self._query.mutation.run().get_all().to_dataframe()
mutation_df = self.get_diagnosis_df(mutation_callable, "mutation_df.pkl")

for idx, row in tqdm(individual_df.iterrows(),total=len(individual_df), desc= "individual dataframe"):
individual_message = individual_factory.from_cancer_data_aggregator(row=row)
individual_id = individual_message.id
indivudal_id = individual_message.id
interpretation = PPkt.Interpretation()
interpretation.id = "id"
interpretation.progress_status = PPkt.Interpretation.ProgressStatus.SOLVED
subject_id_to_interpretation[indivudal_id] = interpretation
ppackt = PPkt.Phenopacket()
ppackt.subject.CopyFrom(individual_message)
self._ppackt_d[individual_id] = ppackt
diagnosis_df = self._query.diagnosis.run().get_all().to_dataframe()
rsub_df = self._query.researchsubject.run().get_all().to_dataframe() # view the dataframe
self._ppackt_d[indivudal_id] = ppackt
merged_df = pd.merge(diagnosis_df, rsub_df, left_on='subject_id', right_on='subject_id',
suffixes=["_di", "_rs"])
disease_factory = CdaDiseaseFactory()
for idx, row in tqdm( merged_df.iterrows(), total=len(merged_df), desc="disease messages"):
#for idx, row in merged_df.iterrows():
for idx, row in tqdm(merged_df.iterrows(), total= len(merged_df.index), desc="merged diagnosis dataframe"):
disease_message = disease_factory.from_cancer_data_aggregator(row)
individual_id = row["subject_id"]
if individual_id not in subject_id_to_interpretation:
raise ValueError(f"Could not find individual id {individual_id} in subject_id_to_disease")
subject_id_to_interpretation.get(individual_id).diagnosis.disease.CopyFrom(disease_message.term)
if individual_id not in self._ppackt_d:
raise ValueError(f"Attempt to enter unknown individual ID from disease factory: \"{individual_id}\"")
self._ppackt_d.get(individual_id).diseases.append(disease_message)

specimen_factory = CdaBiosampleFactory()
for idx, row in tqdm(specimen_df.iterrows(),total= len(specimen_df.index), desc="specimen/biosample dataframe"):
biosample_message = specimen_factory.from_cancer_data_aggregator(row)
individual_id = row["subject_id"]
if individual_id not in self._ppackt_d:
raise ValueError(f"Attempt to enter unknown individual ID from biosample factory: \"{individual_id}\"")
self._ppackt_d.get(individual_id).biosamples.append(biosample_message)

mutation_factory = CdaMutationFactory()
for idx, row in tqdm(mutation_df.iterrows(), total=len(mutation_df.index), desc="mutation dataframe"):
individual_id = row["cda_subject_id"]
if individual_id not in subject_id_to_interpretation:
raise ValueError(f"Could not find individual id {individual_id} in subject_id_to_interpretation")
pp = self._ppackt_d[individual_id]
if len(pp.interpretations) == 0:
interpretation = PPkt.Interpretation()
disease = pp.diseases[0]
diagnosis = PPkt.Diagnosis()
diagnosis.disease.CopyFrom(disease.term)
interpretation.diagnosis.CopyFrom(diagnosis)
pp.interpretations.append(interpretation)
else:
diagnosis = pp.interpretations[0].diagnosis
variant_interpretation_message = mutation_factory.from_cancer_data_aggregator(row)
genomic_interpretation = PPkt.GenomicInterpretation()
# TODO -- CLEAN UP
genomic_interpretation.subject_or_biosample_id = row["Tumor_Aliquot_UUID"]
# by assumption, variants passed to this package are all causative -- ASK CDA
# genomic_interpretation.interpretation_status = PPkt.GenomicInterpretation.InterpretationStatus.CAUSATIVE
genomic_interpretation.variant_interpretation.CopyFrom(variant_interpretation_message)
diagnosis.genomic_interpretations.append(genomic_interpretation)

# make_cda_medicalaction
for idx, row in tqdm(treatment_df.iterrows(), total=len(treatment_df.index), desc="Treatment DF"):
individual_id = row["subject_id"]
medical_action_message = make_cda_medicalaction(row)
if individual_id not in self._ppackt_d:
raise ValueError(f"Attempt to enter unknown individual ID from treatemtn factory: \"{individual_id}\"")
self._ppackt_d.get(individual_id).medical_actions.append(medical_action_message)

spcimen_df = self._query.specimen.run().get_all().to_dataframe()
## get specimen message
## copy to corresponding phenopacket
return list(self._ppackt_d.values())

6 changes: 0 additions & 6 deletions src/oncoexporter/cda/mapper/op_mapper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import abc
from typing import Optional
import pandas as pd
import phenopackets as PPkt


Expand All @@ -11,11 +10,6 @@ class OpMapper(metaclass=abc.ABCMeta):
Superclass for mapper objects that map input data to Ontology classes.
Subclasses may create state in the constructor
"""
def __init__(self, ncit_obo=None) -> None:
# init OAK for NCIT
# init OAK for Mondo
# etc.
pass

@abc.abstractmethod
def get_ontology_term(self, row) ->Optional[PPkt.OntologyClass]:
Expand Down
1 change: 0 additions & 1 deletion src/oncoexporter/model/op_Individual.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import phenopackets as PPkt
import pandas as pd
from .op_message import OpMessage

class OpIndividual(OpMessage):
Expand Down
2 changes: 0 additions & 2 deletions src/oncoexporter/model/op_disease.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import phenopackets as PPkt
import pandas as pd
from .op_message import OpMessage

class OpDisease(OpMessage):
Expand Down
Empty file removed tests/__init__.py
Empty file.
Loading

0 comments on commit 2d83325

Please sign in to comment.