diff --git a/docs/index.md b/docs/index.md index 29dc922..f11ef1b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -41,16 +41,42 @@ I needed to create this Data Loader for my PhD project. I am using it to load & ```bash pip install scdataloader +# or +pip install scDataLoader[dev] # for dev dependencies + +lamin login --key +lamin init --storage [folder-name-where-lamin-data-will-be-stored] --schema bionty +``` + +If you are starting with lamin and had to run `lamin init`, you will also need to populate your ontologies. This is because scPRINT uses ontologies to define its cell types, diseases, sexes, ethnicities, etc. + +You can do it manually or with our function: + +```python +from scdataloader.utils import populate_my_ontology + +populate_my_ontology() #to populate everything (recommended) (can take 2-10mns) + +populate_my_ontology( #the minimum for scprint to run some inferences (denoising, grn inference) +organisms: List[str] = ["NCBITaxon:10090", "NCBITaxon:9606"], + sex: List[str] = ["PATO:0000384", "PATO:0000383"], + celltypes = None, + ethnicities = None, + assays = None, + tissues = None, + diseases = None, + dev_stages = None, +) ``` -### Install it locally and run the notebooks: +### Dev install + +If you want to use the latest version of scDataLoader and work on the code yourself, use `git clone` and `pip install -e` instead of `pip install`. ```bash git clone https://github.com/jkobject/scDataLoader.git -cd scDataLoader -poetry install +pip install -e scDataLoader[dev] ``` -then run the notebooks with the poetry installed environment ## Usage @@ -98,7 +124,7 @@ for i in tqdm.tqdm(datamodule.train_dataloader()): ``` -see the notebooks in [docs](https://www.jkobject.com/scDataLoader/): +see the notebooks: 1. [load a dataset](https://www.jkobject.com/scDataLoader/notebooks/1_download_and_preprocess/) 2. 
[create a dataset](https://www.jkobject.com/scDataLoader/notebooks/2_create_dataloader/) @@ -117,13 +143,34 @@ The main way to use > please refer to the [scPRINT documentation](https://www.jkobject.com/scPRINT/) and [lightning documentation](https://lightning.ai/docs/pytorch/stable/cli/lightning_cli_intermediate.html) for more information on command line usage +## FAQ + +### How to update my ontologies? + +```python +import bionty as bt +bt.reset_sources() + +# Run via CLI: lamin load + +import lnschema_bionty as lb +lb.dev.sync_bionty_source_to_latest() +``` + +### How to load all ontologies? + +```python +from scdataloader import utils +utils.populate_ontologies() # this might take from 5-20mins +``` + ## Development -Read the [CONTRIBUTING.md](../CONTRIBUTING.md) file. +Read the [CONTRIBUTING.md](https://github.com/jkobject/scDataLoader/blob/main/CONTRIBUTING.md) file. ## License -This project is licensed under the MIT License - see the [LICENSE](../LICENSE) file for details. 
## Acknowledgments diff --git a/scdataloader/utils.py b/scdataloader/utils.py index ca160e4..04de9c5 100644 --- a/scdataloader/utils.py +++ b/scdataloader/utils.py @@ -387,6 +387,7 @@ def populate_my_ontology( tissues: List[str] = [], diseases: List[str] = [], dev_stages: List[str] = [], + organism_clade: str = "vertebrates", ): """ creates a local version of the lamin ontologies and add the required missing values in base ontologies @@ -420,22 +421,26 @@ def populate_my_ontology( bt.CellType(name="unknown", ontology_id="unknown").save() # Organism if organisms is not None: - names = bt.Organism.public().df().index if not organisms else organisms + names = ( + bt.Organism.public(organism=organism_clade).df().index + if not organisms + else organisms + ) + source = bt.PublicSource.filter(name="ensembl", organism=organism_clade).last() records = [ i[0] if type(i) is list else i - for i in [bt.Organism.from_source(ontology_id=i) for i in names] + for i in [ + bt.Organism.from_source(ontology_id=i, source=source) for i in names + ] ] ln.save(records) bt.Organism(name="unknown", ontology_id="unknown").save() - organism_names = names # Phenotype if sex is not None: names = bt.Phenotype.public().df().index if not sex else sex + source = bt.PublicSource.filter(name="pato").first() records = [ - bt.Phenotype.from_source( - ontology_id=i, source=bt.PublicSource.filter(name="pato").first() - ) - for i in names + bt.Phenotype.from_source(ontology_id=i, source=source) for i in names ] ln.save(records) bt.Phenotype(name="unknown", ontology_id="unknown").save() @@ -468,15 +473,17 @@ def populate_my_ontology( ) records = bt.DevelopmentalStage.from_values(names, field="ontology_id") ln.save(records) - bt.DevelopmentalStage(name="unknown", ontology_id="unknown").save() + # bt.DevelopmentalStage(name="unknown", ontology_id="unknown").save() names = bt.DevelopmentalStage.public(organism="mouse").df().index + names = [i for i in names if i != "unknown"] + source = 
bt.PublicSource.filter(organism="mouse", name="mmusdv").last() records = [ - bt.DevelopmentalStage.from_source( - ontology_id=i, - source=bt.PublicSource.filter(organism="mouse", name="mmusdv").first(), - ) - for i in names.tolist() + i[0] if type(i) is list else i + for i in [ + bt.DevelopmentalStage.from_source(ontology_id=i, source=source) + for i in names + ] ] ln.save(records) # Disease @@ -487,7 +494,7 @@ def populate_my_ontology( bt.Disease(name="normal", ontology_id="PATO:0000461").save() bt.Disease(name="unknown", ontology_id="unknown").save() # genes - for organism in organism_names: + for organism in ["NCBITaxon:10090", "NCBITaxon:9606"]: # convert onto to name organism = bt.Organism.filter(ontology_id=organism).one().name names = bt.Gene.public(organism=organism).df()["ensembl_gene_id"]