Skip to content

Commit

Permalink
Merge pull request #13 from jkobject/dev
Browse files Browse the repository at this point in the history
Dev Merge
  • Loading branch information
jkobject authored Jan 9, 2025
2 parents 29fee76 + 5fb8821 commit 869fa59
Show file tree
Hide file tree
Showing 18 changed files with 8,557 additions and 770 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,7 @@ dmypy.json
# templates
.github/templates/*
.DS_Store
figures/*/*.png
figures/*.png
figures/add_postp_clust.py
figures/age_relabel.py
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ release: ## Create a new tag for release.
@read -p "Version? (provide the next x.y.z semver) : " TAG
@echo "$${TAG}" > scdataloader/VERSION
@sed -i 's/^version = .*/version = "'$${TAG}'"/' pyproject.toml
@sed -i 's/__version__ = .*/__version__ = "'$${TAG}'"/' scdataloader/__init__.py
@$(ENV_PREFIX)gitchangelog > HISTORY.md
@git add scdataloader/VERSION HISTORY.md pyproject.toml
@git commit -m "release: version $${TAG} 🚀"
Expand Down
1,555 changes: 1,555 additions & 0 deletions figures/debug.ipynb

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions notebooks/nonprimary.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
OoktqBIu8jCoGOJl
n33nFE2kXSNzNhIA
mtoOxeGG0Rg3NPH1
V0tqrgE1z1NY2eUU
4,259 changes: 4,259 additions & 0 deletions notebooks/update_lamin_or_cellxgene.ipynb

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions notebooks/work_on_dataloader_onto part 3.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -928,24 +928,24 @@
],
"source": [
"# Celltype\n",
"names = bt.CellType().df().index\n",
"names = bt.CellType().filter().df().index\n",
"records = lb.CellType.from_values(names, field=lb.CellType.ontology_id)\n",
"ln.save(records)\n",
"lb.CellType(name=\"unknown\", ontology_id=\"unknown\").save()\n",
"# Organism\n",
"# names = bt.Organism().df().index\n",
"# names = bt.Organism().filter().df().index\n",
"names = ['NCBITaxon:10090', 'NCBITaxon:9606']\n",
"records = lb.Organism.from_values(names, field=lb.Organism.ontology_id)\n",
"ln.save(records)\n",
"lb.Organism(name=\"unknown\", ontology_id=\"unknown\").save()\n",
"# Phenotype\n",
"#name = bt.Phenotype().df().index\n",
"#name = bt.Phenotype().filter().df().index\n",
"name = df['sex_ontology_term_id'].unique()\n",
"records = lb.Phenotype.from_values(name, field=lb.Phenotype.ontology_id)\n",
"ln.save(records)\n",
"lb.Phenotype(name=\"unknown\", ontology_id=\"unknown\").save()\n",
"# ethnicity\n",
"names = bt.Ethnicity().df().index\n",
"names = bt.Ethnicity().filter().df().index\n",
"records = lb.Ethnicity.from_values(names, field=lb.Ethnicity.ontology_id)\n",
"ln.save(records)\n",
"lb.Ethnicity(name=\"unknown\", ontology_id=\"unknown\").save() #multi ethnic will have to get renamed\n",
Expand All @@ -958,25 +958,25 @@
"lookup = lb.ExperimentalFactor.lookup()\n",
"lookup.smart_seq_v4.parents.add(lookup.smart_like)\n",
"# Tissue\n",
"#names = bt.Tissue().df().index\n",
"#names = bt.Tissue().filter().df().index\n",
"names= df['tissue_ontology_term_id'].unique()\n",
"records = lb.Tissue.from_values(names, field=lb.Tissue.ontology_id)\n",
"ln.save(records)\n",
"lb.Tissue(name=\"unknown\", ontology_id=\"unknown\").save()\n",
"# DevelopmentalStage\n",
"bionty_df = bt.DevelopmentalStage().df()\n",
"bionty_df = bt.DevelopmentalStage().filter().df()\n",
"records = lb.DevelopmentalStage.from_values(bionty_df.index, field=lb.DevelopmentalStage.ontology_id, organism=\"mouse\")\n",
"ln.save(records)\n",
"lb.DevelopmentalStage(name=\"unknown\", ontology_id=\"unknown\").save()\n",
"# Disease\n",
"# values = bt.Disease().df().index\n",
"# values = bt.Disease().filter().df().index\n",
"values = df['disease_ontology_term_id'].unique()\n",
"records = lb.Disease.from_values(values, field=lb.Disease.ontology_id)\n",
"ln.save(records)\n",
"lb.Disease(name=\"normal\", ontology_id=\"PATO:0000461\").save()\n",
"lb.Disease(name=\"unknown\", ontology_id=\"unknown\").save()\n",
"# genes\n",
"bionty_df = bt.Gene().df()\n",
"bionty_df = bt.Gene().filter().df()\n",
"records = lb.Gene.from_values(bionty_df.index, field=lb.Gene.ontology_id)\n",
"ln.save(records)"
]
Expand Down Expand Up @@ -1050,7 +1050,7 @@
}
],
"source": [
"lb.DevelopmentalStage(organism=\"mouse\").df()"
"lb.DevelopmentalStage(organism=\"mouse\").filter().df()"
]
},
{
Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ authors = [
]
license = "MIT"
readme = "README.md"
requires-python = ">=3.10,<3.11"
requires-python = ">=3.10,<3.14"
keywords = ["scRNAseq", "dataloader", "pytorch", "lamindb", "scPRINT"]
dependencies = [
"numpy>=1.26.0",
"lamindb[bionty]==0.76.12",
"numpy==1.26.0",
"lamindb[bionty]==0.77.2",
"cellxgene-census>=0.1.0",
"torch==2.2.0",
"lightning>=2.0.0",
Expand Down
2 changes: 2 additions & 0 deletions scdataloader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@
from .data import Dataset, SimpleAnnDataset
from .datamodule import DataModule
from .preprocess import Preprocessor

__version__ = "1.6.5"
46 changes: 38 additions & 8 deletions scdataloader/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@
)


# scdataloader --instance="laminlabs/cellxgene" --name="cellxgene-census" --version="2023-12-15" --description="preprocessed for scprint" --new_name="scprint main" --start_at=39
# scdataloader --instance="laminlabs/cellxgene" --name="cellxgene-census" --version="2023-12-15" \
# --description="scPRINT-V2 datasets" --new_name="scprint v2" --n_hvg_for_postp=4000 --cache=False \
# --filter_gene_by_counts=0 --filter_cell_by_counts=300 --min_valid_genes_id=500 \
# --min_nnz_genes=120 --min_dataset_size=100 --maxdropamount=90 \
# --organisms=["NCBITaxon:9606","NCBITaxon:9544","NCBITaxon:9483","NCBITaxon:10090"] \
# --start_at=0
def main():
"""
main function to preprocess datasets in a given lamindb collection.
Expand Down Expand Up @@ -70,7 +75,7 @@ def main():
help="Determines whether to normalize the total counts of each cell to a specific value.",
)
parser.add_argument(
"--subset_hvg",
"--n_hvg_for_postp",
type=int,
default=0,
help="Determines whether to subset highly variable genes.",
Expand Down Expand Up @@ -120,7 +125,7 @@ def main():
parser.add_argument(
"--min_nnz_genes",
type=int,
default=400,
default=200,
help="Specifies the minimum non-zero genes.",
)
parser.add_argument(
Expand All @@ -139,7 +144,16 @@ def main():
help="Specifies the percentage of MT outlier.",
)
parser.add_argument(
"--batch_key", type=Optional[str], default=None, help="Specifies the batch key."
"--batch_keys",
type=list[str],
default=[
"assay_ontology_term_id",
"self_reported_ethnicity_ontology_term_id",
"sex_ontology_term_id",
"donor_id",
"suspension_type",
],
help="Specifies the batch keys.",
)
parser.add_argument(
"--skip_validate",
Expand All @@ -150,15 +164,30 @@ def main():
parser.add_argument(
"--do_postp",
type=bool,
default=False,
default=True,
help="Determines whether to do postprocessing.",
)
parser.add_argument(
"--cache",
type=bool,
default=True,
default=False,
help="Determines whether to cache the dataset.",
)
parser.add_argument(
"--organisms",
type=list,
default=[
"NCBITaxon:9606",
"NCBITaxon:10090",
],
help="Determines the organisms to keep.",
)
parser.add_argument(
"--force_preloaded",
type=bool,
default=False,
help="Determines whether the dataset is preloaded.",
)
args = parser.parse_args()

# Load the collection
Expand All @@ -182,7 +211,7 @@ def main():
filter_gene_by_counts=args.filter_gene_by_counts,
filter_cell_by_counts=args.filter_cell_by_counts,
normalize_sum=args.normalize_sum,
subset_hvg=args.subset_hvg,
n_hvg_for_postp=args.n_hvg_for_postp,
hvg_flavor=args.hvg_flavor,
cache=args.cache,
binning=args.binning,
Expand All @@ -195,12 +224,13 @@ def main():
maxdropamount=args.maxdropamount,
madoutlier=args.madoutlier,
pct_mt_outlier=args.pct_mt_outlier,
batch_key=args.batch_key,
batch_keys=args.batch_keys,
skip_validate=args.skip_validate,
do_postp=args.do_postp,
additional_preprocess=additional_preprocess,
additional_postprocess=additional_postprocess,
keep_files=False,
force_preloaded=args.force_preloaded,
)

# Preprocess the dataset
Expand Down
8 changes: 6 additions & 2 deletions scdataloader/collator.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ def __call__(self, batch) -> dict[str, Tensor]:
tp = []
dataset = []
nnz_loc = []
is_meta = []
for elem in batch:
organism_id = elem[self.organism_name]
if organism_id not in self.organism_ids:
Expand Down Expand Up @@ -188,12 +189,12 @@ def __call__(self, batch) -> dict[str, Tensor]:
loc = loc[self.to_subset[organism_id]]
exprs.append(expr)
gene_locs.append(loc)

if "is_meta" in elem:
is_meta.append(elem["is_meta"])
if self.tp_name is not None:
tp.append(elem[self.tp_name])
else:
tp.append(0)

other_classes.append([elem[i] for i in self.class_names])

expr = np.array(exprs)
Expand All @@ -202,6 +203,7 @@ def __call__(self, batch) -> dict[str, Tensor]:
total_count = np.array(total_count)
other_classes = np.array(other_classes)
dataset = np.array(dataset)
is_meta = np.array(is_meta)

# normalize counts
if self.norm_to is not None:
Expand Down Expand Up @@ -229,6 +231,8 @@ def __call__(self, batch) -> dict[str, Tensor]:
"tp": Tensor(tp),
"depth": Tensor(total_count),
}
if len(is_meta) > 0:
ret.update({"is_meta": Tensor(is_meta)})
if len(dataset) > 0:
ret.update({"dataset": Tensor(dataset).to(long)})
if self.downsample is not None:
Expand Down
99 changes: 99 additions & 0 deletions scdataloader/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,102 @@
"TruDrop": "",
"Visium Spatial Gene Expression": "",
}


# Groups developmental-stage ontology term IDs under a coarser "main" stage.
#
# Each key is a term ID (an HsapDv ID, or the literal string "unknown"). Its
# value is the list of term IDs (MmusDv, HsapDv or UBERON) grouped under that
# key. An empty list means no other term is grouped under the key in this table.
#
# The inline labels are the author's human-readable stage names for the IDs;
# nothing in this table checks them against the ontologies.
#
# NOTE(review): presumably used to relabel mouse and fine-grained human stages
# to the key's stage -- confirm against the caller, which is not visible here.
MAIN_HUMAN_MOUSE_DEV_STAGE_MAP = {
"HsapDv:0010000": [
"MmusDv:0000092", # postnatal stage
],
"HsapDv:0000258": [ # mature stage
"MmusDv:0000110", # mature stage
"HsapDv:0000204",
],
"HsapDv:0000227": [ # late adult stage
"MmusDv:0000091", # 20 month-old stage
"MmusDv:0000089", # 18 month-old stage
],
"HsapDv:0000272": [], # 60-79 year-old stage
"HsapDv:0000095": [], # 80 year-old and over stage
"HsapDv:0000267": [ # middle aged stage
"MmusDv:0000087", # 16 month-old stage
"UBERON:0018241", # prime adult stage
"MmusDv:0000083", # 12 month-old stage
"HsapDv:0000092", # same
],
"HsapDv:0000266": [ # young adult stage
"MmusDv:0000050", # 6 weeks
"HsapDv:0000089", # same
"MmusDv:0000051", # 7 weeks
"MmusDv:0000052", # 8 weeks
"MmusDv:0000053", # 9 weeks
"MmusDv:0000054", # 10 weeks
"MmusDv:0000055", # 11 weeks
"MmusDv:0000056", # 12 weeks
"MmusDv:0000057", # 13 weeks
"MmusDv:0000058", # 14 weeks
"MmusDv:0000059", # 15 weeks
"MmusDv:0000061", # early adult stage
"MmusDv:0000062", # 2 month-old stage
"MmusDv:0000063", # 3 month-old stage
"MmusDv:0000064", # 4 month-old stage
"MmusDv:0000065", # 16 weeks
"MmusDv:0000066", # 17 weeks
"MmusDv:0000067", # 18 weeks
"MmusDv:0000068", # 19 weeks
"MmusDv:0000070", # 20 weeks
"MmusDv:0000071", # 21 weeks
"MmusDv:0000072", # 22 weeks
"MmusDv:0000073", # 23 weeks
"MmusDv:0000074", # 24 weeks
"MmusDv:0000077", # 6 month-old stage
"MmusDv:0000079", # 8 month-old stage
"MmusDv:0000098", # 25 weeks
"MmusDv:0000099", # 26 weeks
"MmusDv:0000102", # 29 weeks
],
"HsapDv:0000265": [], # child stage (1-4 yo)
"HsapDv:0000271": [ # juvenile stage (5-14 yo)
"MmusDv:0000048", # 4 weeks
"MmusDv:0000049", # 5 weeks
],
"HsapDv:0000260": [ # infant stage
"MmusDv:0000046", # 2 weeks
"MmusDv:0000045", # 1 week
"MmusDv:0000047", # 3 weeks
"HsapDv:0000083",
],
"HsapDv:0000262": [ # newborn stage (0-28 days)
"MmusDv:0000036", # Theiler stage 27
"MmusDv:0000037", # Theiler stage 28
"MmusDv:0000113", # 4-7 days
],
"HsapDv:0000007": [], # Carnegie stage 03
"HsapDv:0000008": [], # Carnegie stage 04
"HsapDv:0000009": [], # Carnegie stage 05
"HsapDv:0000003": [], # Carnegie stage 01
"HsapDv:0000005": [], # Carnegie stage 02
"HsapDv:0000010": [], # gastrula stage
"HsapDv:0000012": [], # neurula stage
"HsapDv:0000015": [ # organogenesis stage
# NOTE(review): the labels on the next two IDs (stage 13, then stage 12) run
# against the ascending ID order used by the rest of this list -- verify the
# labels against MmusDv.
"MmusDv:0000019", # Theiler stage 13
"MmusDv:0000020", # Theiler stage 12
"MmusDv:0000021", # Theiler stage 14
"MmusDv:0000022", # Theiler stage 15
"MmusDv:0000023", # Theiler stage 16
"MmusDv:0000024", # Theiler stage 17
"MmusDv:0000025", # Theiler stage 18
"MmusDv:0000026", # Theiler stage 19
"MmusDv:0000027", # Theiler stage 20
"MmusDv:0000028", # Theiler stage 21
"MmusDv:0000029", # Theiler stage 22
],
"HsapDv:0000037": [ # fetal stage
"MmusDv:0000033", # Theiler stage 24
"MmusDv:0000034", # Theiler stage 25
"MmusDv:0000035", # Theiler stage 26
"MmusDv:0000032", # Theiler stage 23
],
"unknown": [
"MmusDv:0000041", # unknown
],
}
Loading

0 comments on commit 869fa59

Please sign in to comment.