Merge pull request #13 from jkobject/dev

Dev Merge
jkobject · Jan 9, 2025 · 869fa59 · 869fa59
2 parents 29fee76 + 5fb8821
commit 869fa59
Show file tree

Hide file tree

Showing 18 changed files with 8,557 additions and 770 deletions.
diff --git a/.gitignore b/.gitignore
@@ -131,3 +131,7 @@ dmypy.json
 # templates
 .github/templates/*
 .DS_Store
+figures/*/*.png
+figures/*.png
+figures/add_postp_clust.py
+figures/age_relabel.py
diff --git a/Makefile b/Makefile
@@ -68,6 +68,7 @@ release:          ## Create a new tag for release.
 	@read -p "Version? (provide the next x.y.z semver) : " TAG
 	@echo "$${TAG}" > scdataloader/VERSION
 	@sed -i 's/^version = .*/version = "'$${TAG}'"/' pyproject.toml
+	@sed -i 's/__version__ = .*/__version__ = "'$${TAG}'"/' scdataloader/__init__.py
 	@$(ENV_PREFIX)gitchangelog > HISTORY.md
 	@git add scdataloader/VERSION HISTORY.md pyproject.toml
 	@git commit -m "release: version $${TAG} 🚀"

diff --git a/figures/debug.ipynb b/figures/debug.ipynb
diff --git a/notebooks/nonprimary.txt b/notebooks/nonprimary.txt
@@ -0,0 +1,4 @@
+OoktqBIu8jCoGOJl
+n33nFE2kXSNzNhIA
+mtoOxeGG0Rg3NPH1
+V0tqrgE1z1NY2eUU
diff --git a/notebooks/update_lamin_or_cellxgene.ipynb b/notebooks/update_lamin_or_cellxgene.ipynb
diff --git a/notebooks/work_on_dataloader_onto part 3.ipynb b/notebooks/work_on_dataloader_onto part 3.ipynb
@@ -928,24 +928,24 @@
       ],
       "source": [
         "# Celltype\n",
-        "names = bt.CellType().df().index\n",
+        "names = bt.CellType().filter().df().index\n",
         "records = lb.CellType.from_values(names, field=lb.CellType.ontology_id)\n",
         "ln.save(records)\n",
         "lb.CellType(name=\"unknown\", ontology_id=\"unknown\").save()\n",
         "# Organism\n",
-        "# names = bt.Organism().df().index\n",
+        "# names = bt.Organism().filter().df().index\n",
         "names = ['NCBITaxon:10090', 'NCBITaxon:9606']\n",
         "records = lb.Organism.from_values(names, field=lb.Organism.ontology_id)\n",
         "ln.save(records)\n",
         "lb.Organism(name=\"unknown\", ontology_id=\"unknown\").save()\n",
         "# Phenotype\n",
-        "#name = bt.Phenotype().df().index\n",
+        "#name = bt.Phenotype().filter().df().index\n",
         "name = df['sex_ontology_term_id'].unique()\n",
         "records = lb.Phenotype.from_values(name, field=lb.Phenotype.ontology_id)\n",
         "ln.save(records)\n",
         "lb.Phenotype(name=\"unknown\", ontology_id=\"unknown\").save()\n",
         "# ethnicity\n",
-        "names = bt.Ethnicity().df().index\n",
+        "names = bt.Ethnicity().filter().df().index\n",
         "records = lb.Ethnicity.from_values(names, field=lb.Ethnicity.ontology_id)\n",
         "ln.save(records)\n",
         "lb.Ethnicity(name=\"unknown\", ontology_id=\"unknown\").save() #multi ethnic will have to get renamed\n",
@@ -958,25 +958,25 @@
         "lookup = lb.ExperimentalFactor.lookup()\n",
         "lookup.smart_seq_v4.parents.add(lookup.smart_like)\n",
         "# Tissue\n",
-        "#names = bt.Tissue().df().index\n",
+        "#names = bt.Tissue().filter().df().index\n",
         "names= df['tissue_ontology_term_id'].unique()\n",
         "records = lb.Tissue.from_values(names, field=lb.Tissue.ontology_id)\n",
         "ln.save(records)\n",
         "lb.Tissue(name=\"unknown\", ontology_id=\"unknown\").save()\n",
         "# DevelopmentalStage\n",
-        "bionty_df = bt.DevelopmentalStage().df()\n",
+        "bionty_df = bt.DevelopmentalStage().filter().df()\n",
         "records = lb.DevelopmentalStage.from_values(bionty_df.index, field=lb.DevelopmentalStage.ontology_id, organism=\"mouse\")\n",
         "ln.save(records)\n",
         "lb.DevelopmentalStage(name=\"unknown\", ontology_id=\"unknown\").save()\n",
         "# Disease\n",
-        "# values = bt.Disease().df().index\n",
+        "# values = bt.Disease().filter().df().index\n",
         "values = df['disease_ontology_term_id'].unique()\n",
         "records = lb.Disease.from_values(values, field=lb.Disease.ontology_id)\n",
         "ln.save(records)\n",
         "lb.Disease(name=\"normal\", ontology_id=\"PATO:0000461\").save()\n",
         "lb.Disease(name=\"unknown\", ontology_id=\"unknown\").save()\n",
         "# genes\n",
-        "bionty_df = bt.Gene().df()\n",
+        "bionty_df = bt.Gene().filter().df()\n",
         "records = lb.Gene.from_values(bionty_df.index, field=lb.Gene.ontology_id)\n",
         "ln.save(records)"
       ]
@@ -1050,7 +1050,7 @@
         }
       ],
       "source": [
-        "lb.DevelopmentalStage(organism=\"mouse\").df()"
+        "lb.DevelopmentalStage(organism=\"mouse\").filter().df()"
       ]
     },
     {

diff --git a/pyproject.toml b/pyproject.toml
@@ -7,11 +7,11 @@ authors = [
 ]
 license = "MIT"
 readme = "README.md"
-requires-python = ">=3.10,<3.11"
+requires-python = ">=3.10,<3.14"
 keywords = ["scRNAseq", "dataloader", "pytorch", "lamindb", "scPRINT"]
 dependencies = [
-    "numpy>=1.26.0",
-    "lamindb[bionty]==0.76.12",
+    "numpy==1.26.0",
+    "lamindb[bionty]==0.77.2",
     "cellxgene-census>=0.1.0",
     "torch==2.2.0",
     "lightning>=2.0.0",

diff --git a/scdataloader/__init__.py b/scdataloader/__init__.py
@@ -2,3 +2,5 @@
 from .data import Dataset, SimpleAnnDataset
 from .datamodule import DataModule
 from .preprocess import Preprocessor
+
+__version__ = "1.6.5"
diff --git a/scdataloader/__main__.py b/scdataloader/__main__.py
@@ -10,7 +10,12 @@
 )
 
 
-# scdataloader --instance="laminlabs/cellxgene" --name="cellxgene-census" --version="2023-12-15" --description="preprocessed for scprint" --new_name="scprint main" --start_at=39
+# scdataloader --instance="laminlabs/cellxgene" --name="cellxgene-census" --version="2023-12-15" \
+# --description="scPRINT-V2 datasets" --new_name="scprint v2" --n_hvg_for_postp=4000 --cache=False \
+# --filter_gene_by_counts=0 --filter_cell_by_counts=300 --min_valid_genes_id=500 \
+# --min_nnz_genes=120 --min_dataset_size=100 --maxdropamount=90 \
+# --organisms=["NCBITaxon:9606","NCBITaxon:9544","NCBITaxon:9483","NCBITaxon:10090"] \
+# --start_at=0
 def main():
     """
     main function to preprocess datasets in a given lamindb collection.
@@ -70,7 +75,7 @@ def main():
         help="Determines whether to normalize the total counts of each cell to a specific value.",
     )
     parser.add_argument(
-        "--subset_hvg",
+        "--n_hvg_for_postp",
         type=int,
         default=0,
         help="Determines whether to subset highly variable genes.",
@@ -120,7 +125,7 @@ def main():
     parser.add_argument(
         "--min_nnz_genes",
         type=int,
-        default=400,
+        default=200,
         help="Specifies the minimum non-zero genes.",
     )
     parser.add_argument(
@@ -139,7 +144,16 @@ def main():
         help="Specifies the percentage of MT outlier.",
     )
     parser.add_argument(
-        "--batch_key", type=Optional[str], default=None, help="Specifies the batch key."
+        "--batch_keys",
+        type=list[str],
+        default=[
+            "assay_ontology_term_id",
+            "self_reported_ethnicity_ontology_term_id",
+            "sex_ontology_term_id",
+            "donor_id",
+            "suspension_type",
+        ],
+        help="Specifies the batch keys.",
     )
     parser.add_argument(
         "--skip_validate",
@@ -150,15 +164,30 @@ def main():
     parser.add_argument(
         "--do_postp",
         type=bool,
-        default=False,
+        default=True,
         help="Determines whether to do postprocessing.",
     )
     parser.add_argument(
         "--cache",
         type=bool,
-        default=True,
+        default=False,
         help="Determines whether to cache the dataset.",
     )
+    parser.add_argument(
+        "--organisms",
+        type=list,
+        default=[
+            "NCBITaxon:9606",
+            "NCBITaxon:10090",
+        ],
+        help="Determines the organisms to keep.",
+    )
+    parser.add_argument(
+        "--force_preloaded",
+        type=bool,
+        default=False,
+        help="Determines whether the dataset is preloaded.",
+    )
     args = parser.parse_args()
 
     # Load the collection
@@ -182,7 +211,7 @@ def main():
         filter_gene_by_counts=args.filter_gene_by_counts,
         filter_cell_by_counts=args.filter_cell_by_counts,
         normalize_sum=args.normalize_sum,
-        subset_hvg=args.subset_hvg,
+        n_hvg_for_postp=args.n_hvg_for_postp,
         hvg_flavor=args.hvg_flavor,
         cache=args.cache,
         binning=args.binning,
@@ -195,12 +224,13 @@ def main():
         maxdropamount=args.maxdropamount,
         madoutlier=args.madoutlier,
         pct_mt_outlier=args.pct_mt_outlier,
-        batch_key=args.batch_key,
+        batch_keys=args.batch_keys,
         skip_validate=args.skip_validate,
         do_postp=args.do_postp,
         additional_preprocess=additional_preprocess,
         additional_postprocess=additional_postprocess,
         keep_files=False,
+        force_preloaded=args.force_preloaded,
     )
 
     # Preprocess the dataset

diff --git a/scdataloader/collator.py b/scdataloader/collator.py
@@ -131,6 +131,7 @@ def __call__(self, batch) -> dict[str, Tensor]:
         tp = []
         dataset = []
         nnz_loc = []
+        is_meta = []
         for elem in batch:
             organism_id = elem[self.organism_name]
             if organism_id not in self.organism_ids:
@@ -188,12 +189,12 @@ def __call__(self, batch) -> dict[str, Tensor]:
                 loc = loc[self.to_subset[organism_id]]
             exprs.append(expr)
             gene_locs.append(loc)
-
+            if "is_meta" in elem:
+                is_meta.append(elem["is_meta"])
             if self.tp_name is not None:
                 tp.append(elem[self.tp_name])
             else:
                 tp.append(0)
-
             other_classes.append([elem[i] for i in self.class_names])
 
         expr = np.array(exprs)
@@ -202,6 +203,7 @@ def __call__(self, batch) -> dict[str, Tensor]:
         total_count = np.array(total_count)
         other_classes = np.array(other_classes)
         dataset = np.array(dataset)
+        is_meta = np.array(is_meta)
 
         # normalize counts
         if self.norm_to is not None:
@@ -229,6 +231,8 @@ def __call__(self, batch) -> dict[str, Tensor]:
             "tp": Tensor(tp),
             "depth": Tensor(total_count),
         }
+        if len(is_meta) > 0:
+            ret.update({"is_meta": Tensor(is_meta)})
         if len(dataset) > 0:
             ret.update({"dataset": Tensor(dataset).to(long)})
         if self.downsample is not None:

diff --git a/scdataloader/config.py b/scdataloader/config.py
@@ -110,3 +110,102 @@
     "TruDrop": "",
     "Visium Spatial Gene Expression": "",
 }
+
+
+MAIN_HUMAN_MOUSE_DEV_STAGE_MAP = {  # main stage id (HsapDv, or "unknown") -> list of stage ids listed under it; usage is not shown in this file section
+    "HsapDv:0010000": [  # NOTE(review): key is unlabeled; presumably the human postnatal stage — confirm
+        "MmusDv:0000092",  # postnatal stage
+    ],
+    "HsapDv:0000258": [  # mature stage
+        "MmusDv:0000110",  # mature stage
+        "HsapDv:0000204",  # NOTE(review): unlabeled human term — confirm which stage this is
+    ],
+    "HsapDv:0000227": [  # late adult stage
+        "MmusDv:0000091",  # 20 month-old stage
+        "MmusDv:0000089",  # 18 month-old stage
+    ],
+    "HsapDv:0000272": [],  # 60-79 year-old stage (no other ids listed under it)
+    "HsapDv:0000095": [],  # 80 year-old and over stage (no other ids listed under it)
+    "HsapDv:0000267": [  # middle aged stage
+        "MmusDv:0000087",  # 16 month-old stage
+        "UBERON:0018241",  # prime adult stage
+        "MmusDv:0000083",  # 12 month-old stage
+        "HsapDv:0000092",  # "same": presumably the same stage as the key under another id — TODO confirm
+    ],
+    "HsapDv:0000266": [  # young adult stage
+        "MmusDv:0000050",  # 6 weeks
+        "HsapDv:0000089",  # "same": presumably the same stage as the key under another id — TODO confirm
+        "MmusDv:0000051",  # 7 weeks
+        "MmusDv:0000052",  # 8 weeks
+        "MmusDv:0000053",  # 9 weeks
+        "MmusDv:0000054",  # 10 weeks
+        "MmusDv:0000055",  # 11 weeks
+        "MmusDv:0000056",  # 12 weeks
+        "MmusDv:0000057",  # 13 weeks
+        "MmusDv:0000058",  # 14 weeks
+        "MmusDv:0000059",  # 15 weeks
+        "MmusDv:0000061",  # early adult stage
+        "MmusDv:0000062",  # 2 month-old stage
+        "MmusDv:0000063",  # 3 month-old stage
+        "MmusDv:0000064",  # 4 month-old stage
+        "MmusDv:0000065",  # 16 weeks
+        "MmusDv:0000066",  # 17 weeks
+        "MmusDv:0000067",  # 18 weeks
+        "MmusDv:0000068",  # 19 weeks
+        "MmusDv:0000070",  # 20 weeks
+        "MmusDv:0000071",  # 21 weeks
+        "MmusDv:0000072",  # 22 weeks
+        "MmusDv:0000073",  # 23 weeks
+        "MmusDv:0000074",  # 24 weeks
+        "MmusDv:0000077",  # 6 month-old stage
+        "MmusDv:0000079",  # 8 month-old stage
+        "MmusDv:0000098",  # 25 weeks
+        "MmusDv:0000099",  # 26 weeks
+        "MmusDv:0000102",  # 29 weeks
+    ],
+    "HsapDv:0000265": [],  # child stage (1-4 yo) (no other ids listed under it)
+    "HsapDv:0000271": [  # juvenile stage (5-14 yo)
+        "MmusDv:0000048",  # 4 weeks
+        "MmusDv:0000049",  # 5 weeks
+    ],
+    "HsapDv:0000260": [  # infant stage
+        "MmusDv:0000046",  # 2 weeks
+        "MmusDv:0000045",  # 1 week
+        "MmusDv:0000047",  # 3 weeks
+        "HsapDv:0000083",  # NOTE(review): unlabeled human term — confirm which stage this is
+    ],
+    "HsapDv:0000262": [  # newborn stage (0-28 days)
+        "MmusDv:0000036",  # Theiler stage 27
+        "MmusDv:0000037",  # Theiler stage 28
+        "MmusDv:0000113",  # 4-7 days
+    ],
+    "HsapDv:0000007": [],  # Carnegie stage 03 (no other ids listed under it)
+    "HsapDv:0000008": [],  # Carnegie stage 04 (no other ids listed under it)
+    "HsapDv:0000009": [],  # Carnegie stage 05 (no other ids listed under it)
+    "HsapDv:0000003": [],  # Carnegie stage 01 (no other ids listed under it)
+    "HsapDv:0000005": [],  # Carnegie stage 02 (no other ids listed under it)
+    "HsapDv:0000010": [],  # gastrula stage (no other ids listed under it)
+    "HsapDv:0000012": [],  # neurula stage (no other ids listed under it)
+    "HsapDv:0000015": [  # organogenesis stage
+        "MmusDv:0000019",  # Theiler stage 13 — NOTE(review): labels for ids 0000019/0000020 run against id order; verify
+        "MmusDv:0000020",  # Theiler stage 12 — NOTE(review): see id 0000019; verify against the ontology
+        "MmusDv:0000021",  # Theiler stage 14
+        "MmusDv:0000022",  # Theiler stage 15
+        "MmusDv:0000023",  # Theiler stage 16
+        "MmusDv:0000024",  # Theiler stage 17
+        "MmusDv:0000025",  # Theiler stage 18
+        "MmusDv:0000026",  # Theiler stage 19
+        "MmusDv:0000027",  # Theiler stage 20
+        "MmusDv:0000028",  # Theiler stage 21
+        "MmusDv:0000029",  # Theiler stage 22
+    ],
+    "HsapDv:0000037": [  # fetal stage
+        "MmusDv:0000033",  # Theiler stage 24
+        "MmusDv:0000034",  # Theiler stage 25
+        "MmusDv:0000035",  # Theiler stage 26
+        "MmusDv:0000032",  # Theiler stage 23
+    ],
+    "unknown": [  # catch-all key; the only non-ontology-id key in the map
+        "MmusDv:0000041",  # unknown
+    ],
+}