Commit 96744c8

updating lamindb

jkobject committed Feb 19, 2024
1 parent f2549f3 commit 96744c8
Showing 8 changed files with 1,050 additions and 27 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -36,7 +36,7 @@ the idea is to use it to train models like scGPT / GeneFormer (and soon, scPrint

 Currently one would have to use the preprocess function to make the dataset fit for different tools like scGPT / Geneformer. But I would want to enable it through different Collators. This is still missing and a WIP... (please do contribute!)

-![](docs/scdataloader.drawio.png)
+![docs/scdataloader.drawio.png]()

 ## Install it from PyPI
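A minimal sketch of the preprocessing step the README paragraph above describes, for orientation only; the `Preprocessor` import, its default arguments, and the file path are assumptions, not a confirmed API:

```python
import anndata as ad

# Assumed entry point: the package's preprocess function/class.
from scdataloader import Preprocessor

# Any AnnData single-cell dataset (hypothetical path).
adata = ad.read_h5ad("my_dataset.h5ad")

# Run preprocessing so the dataset fits tools like scGPT / Geneformer;
# until dedicated Collators land, this is the route the README describes.
preprocessor = Preprocessor()
adata = preprocessor(adata)
```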
2 changes: 1 addition & 1 deletion docs/dataloader.md
@@ -1,4 +1,4 @@
 # Documentation for `DataLoader`

-::: scdataloader.dataloader.DataLoader
+::: scdataloader.dataloader.DataModule
     handler: python
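The docs now point at `DataModule` instead of `DataLoader`. A hypothetical usage sketch, assuming a PyTorch Lightning-style module; the constructor arguments shown are illustrative, not the package's confirmed signature:

```python
from scdataloader import DataModule  # assumed top-level export

# Hypothetical arguments: a lamindb collection name and a batch size.
datamodule = DataModule(
    collection_name="my lamindb collection",
    batch_size=64,
)
datamodule.setup()

# Iterate as with any Lightning DataModule.
for batch in datamodule.train_dataloader():
    break
```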
2 changes: 2 additions & 0 deletions docs/index.md
@@ -25,6 +25,8 @@ It allows you to:

 I needed to create this Data Loader for my PhD project. I am using it to load & preprocess thousands of datasets containing millions of cells in a few seconds. I believed that individuals employing AI for single-cell RNA sequencing and other sequencing datasets would eagerly utilize and desire such a tool, which presently does not exist.

+![scdataloader.drawio.png]()
+
 ## Install it from PyPI

 ```bash
Binary file added docs/scdataloader.drawio.png
519 changes: 519 additions & 0 deletions notebooks/onto_rel.ipynb

Large diffs are not rendered by default.

494 changes: 494 additions & 0 deletions notebooks/rel_onto_tissues_age.ipynb

Large diffs are not rendered by default.

26 changes: 17 additions & 9 deletions scdataloader/data.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass, field

 import lamindb as ln
-import lnschema_bionty as lb
+import bionty as bt
 import pandas as pd
 from torch.utils.data import Dataset as torchDataset
 from typing import Union
@@ -149,7 +149,12 @@ def encoder(self):

     def __getitem__(self, *args, **kwargs):
         item = self.mapped_dataset.__getitem__(*args, **kwargs)
-        #item.update({"unseen_genes": self.get_unseen_mapped_dataset_elements(*args, **kwargs)})
+        # import pdb
+
+        # pdb.set_trace()
+        # item.update(
+        #     {"unseen_genes": self.get_unseen_mapped_dataset_elements(*args, **kwargs)}
+        # )
         # ret = {}
         # ret["count"] = item[0]
         # for i, val in enumerate(self.obs):
@@ -189,7 +194,7 @@ def get_unseen_mapped_dataset_elements(self, idx):
         # embeddings = []
         # for o in self.organisms:
         #     genedf = genedfs[genedfs.organism == o]
-        #     org_name = lb.Organism.filter(ontology_id=o).one().scientific_name
+        #     org_name = bt.Organism.filter(ontology_id=o).one().scientific_name
         #     embedding = embed(
         #         genedf=genedf,
         #         organism=org_name,
@@ -223,37 +228,37 @@ def define_hierarchies(self, labels):
                 )
             elif label == "cell_type_ontology_term_id":
                 parentdf = (
-                    lb.CellType.filter()
+                    bt.CellType.filter()
                     .df(include=["parents__ontology_id"])
                     .set_index("ontology_id")
                 )
             elif label == "tissue_ontology_term_id":
                 parentdf = (
-                    lb.Tissue.filter()
+                    bt.Tissue.filter()
                     .df(include=["parents__ontology_id"])
                     .set_index("ontology_id")
                 )
             elif label == "disease_ontology_term_id":
                 parentdf = (
-                    lb.Disease.filter()
+                    bt.Disease.filter()
                     .df(include=["parents__ontology_id"])
                     .set_index("ontology_id")
                 )
             elif label == "development_stage_ontology_term_id":
                 parentdf = (
-                    lb.DevelopmentalStage.filter()
+                    bt.DevelopmentalStage.filter()
                     .df(include=["parents__ontology_id"])
                     .set_index("ontology_id")
                 )
             elif label == "assay_ontology_term_id":
                 parentdf = (
-                    lb.ExperimentalFactor.filter()
+                    bt.ExperimentalFactor.filter()
                     .df(include=["parents__ontology_id"])
                     .set_index("ontology_id")
                 )
             elif label == "self_reported_ethnicity_ontology_term_id":
                 parentdf = (
-                    lb.Ethnicity.filter()
+                    bt.Ethnicity.filter()
                     .df(include=["parents__ontology_id"])
                     .set_index("ontology_id")
                 )
@@ -267,6 +272,9 @@ def define_hierarchies(self, labels):
             cats = self.mapped_dataset.get_merged_categories(label)
             addition = set(LABELS_TOADD.get(label, {}).values())
             cats |= addition
+            # import pdb
+
+            # pdb.set_trace()
             groupings, _, lclass = get_ancestry_mapping(cats, parentdf)
             for i, j in groupings.items():
                 if len(j) == 0:
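Throughout `data.py` this commit swaps `lnschema_bionty` (imported as `lb`) for `bionty` (imported as `bt`). A minimal sketch of the parent-lookup pattern used in `define_hierarchies`, assuming a lamindb instance configured with the bionty schema and populated registries:

```python
import bionty as bt

# Fetch every cell-type record with its parents' ontology ids, indexed by
# ontology id -- the same pattern the diff applies to Tissue, Disease,
# DevelopmentalStage, ExperimentalFactor, and Ethnicity.
parentdf = (
    bt.CellType.filter()
    .df(include=["parents__ontology_id"])
    .set_index("ontology_id")
)
print(parentdf.head())
```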
32 changes: 16 additions & 16 deletions scdataloader/utils.py
@@ -360,20 +360,20 @@ def populate_my_ontology(
         dev_stages (list, optional): List of developmental stages. Defaults to [].
     """

-    names = bt.CellType.from_public().df().index if not celltypes else celltypes
+    names = bt.CellType.public().df().index if not celltypes else celltypes
     records = bt.CellType.from_values(names, field="ontology_id")
-    ln.save(records)
+    ln.save(records, parents=bool(celltypes))
     bt.CellType(name="unknown", ontology_id="unknown").save()
     # Organism
-    names = bt.Organism.from_public().df().index if not organisms else organisms
+    names = bt.Organism.public().df().index if not organisms else organisms
     records = [
         i[0] if type(i) is list else i
         for i in [bt.Organism.from_public(ontology_id=i) for i in names]
     ]
-    ln.save(records)
+    ln.save(records, parents=bool(organisms))
     bt.Organism(name="unknown", ontology_id="unknown").save()
     # Phenotype
-    names = bt.Phenotype.from_public().df().index if not sex else sex
+    names = bt.Phenotype.public().df().index if not sex else sex
     records = [
         bt.Phenotype.from_public(
             ontology_id=i,
@@ -383,38 +383,38 @@ def populate_my_ontology(
         )
         for i in names
     ]
-    ln.save(records)
+    ln.save(records, parents=bool(sex))
     bt.Phenotype(name="unknown", ontology_id="unknown").save()
     # ethnicity
-    names = bt.Ethnicity.from_public().df().index if not ethnicities else ethnicities
+    names = bt.Ethnicity.public().df().index if not ethnicities else ethnicities
     records = bt.Ethnicity.from_values(names, field="ontology_id")
-    ln.save(records)
+    ln.save(records, parents=bool(ethnicities))
     bt.Ethnicity(
         name="unknown", ontology_id="unknown"
     ).save()  # multi ethnic will have to get renamed
     # ExperimentalFactor
-    names = bt.ExperimentalFactor.from_public().df().index if not assays else assays
+    names = bt.ExperimentalFactor.public().df().index if not assays else assays
     records = bt.ExperimentalFactor.from_values(names, field="ontology_id")
-    ln.save(records)
+    ln.save(records, parents=bool(assays))
     bt.ExperimentalFactor(name="unknown", ontology_id="unknown").save()
     # lookup = bt.ExperimentalFactor.lookup()
     # lookup.smart_seq_v4.parents.add(lookup.smart_like)
     # Tissue
-    names = bt.Tissue.from_public().df().index if not tissues else tissues
+    names = bt.Tissue.public().df().index if not tissues else tissues
     records = bt.Tissue.from_values(names, field="ontology_id")
-    ln.save(records)
+    ln.save(records, parents=bool(tissues))
     bt.Tissue(name="unknown", ontology_id="unknown").save()
     # DevelopmentalStage
     names = (
-        bt.DevelopmentalStage.from_public().df().index if not dev_stages else dev_stages
+        bt.DevelopmentalStage.public().df().index if not dev_stages else dev_stages
     )
     records = bt.DevelopmentalStage.from_values(names, field="ontology_id")
-    ln.save(records)
+    ln.save(records, parents=bool(dev_stages))
     bt.DevelopmentalStage(name="unknown", ontology_id="unknown").save()
     # Disease
-    names = bt.Disease.from_public().df().index if not diseases else diseases
+    names = bt.Disease.public().df().index if not diseases else diseases
     records = bt.Disease.from_values(names, field="ontology_id")
-    ln.save(records)
+    ln.save(records, parents=bool(diseases))
     bt.Disease(name="normal", ontology_id="PATO:0000461").save()
     bt.Disease(name="unknown", ontology_id="unknown").save()
     # genes
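The `utils.py` changes track two API updates: `from_public()` becomes `public()` for the public ontology reference, and `ln.save()` takes a `parents` flag. A minimal sketch for one registry, assuming an initialized lamindb instance with the bionty schema:

```python
import bionty as bt
import lamindb as ln

# New accessor: .public() replaces .from_public() for the public ontology.
names = bt.Tissue.public().df().index
records = bt.Tissue.from_values(names, field="ontology_id")

# parents=False skips creating parent records when registering the whole
# public ontology; the diff passes parents=bool(tissues), so parent links
# are only built for user-supplied subsets.
ln.save(records, parents=False)
```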
