openproblems-bio · rcannood · Dec 19, 2024 · Dec 4, 2024 · Dec 4, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,11 @@
 
 * Added `methods/limma_remove_batch_effect` component (PR #7).
 
+* Added three negative control methods (PR #8):
+  - `control_methods/shuffle_integration`
+  - `control_methods/shuffle_integration_by_batch`
+  - `control_methods/shuffle_integration_by_cell_type`
+
 ## MAJOR CHANGES
 
 ## MINOR CHANGES

diff --git a/src/control_methods/shuffle_integration/config.vsh.yaml b/src/control_methods/shuffle_integration/config.vsh.yaml
@@ -0,0 +1,17 @@
+__merge__: /src/api/comp_control_method.yaml
+name: shuffle_integration
+label: Shuffle integration
+summary: Integrations are randomly permuted
+description: Negative control in which the cells (rows) of the preprocessed marker matrix are randomly permuted across the whole dataset.
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/control_methods/utils.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime, midmem, lowcpu]
diff --git a/src/control_methods/shuffle_integration/script.py b/src/control_methods/shuffle_integration/script.py
@@ -0,0 +1,48 @@
+import anndata as ad
+import sys
+
+## VIASH START
+# Placeholder values for running this script outside of viash.
+par = {
+    "input_unintegrated": "resources_test/task_cyto_batch_integration/starter_file/unintegrated_censored.h5ad",
+    "output": "output.h5ad",
+}
+meta = {
+    "name": "shuffle_integration",
+    "resources_dir": "src/control_methods",
+}
+## VIASH END
+
+print("Importing helper functions", flush=True)
+# utils.py is listed as a component resource, so it is found in resources_dir.
+sys.path.append(meta["resources_dir"])
+from utils import _randomize_features
+
+print("Reading and preparing input files", flush=True)
+adata = ad.read_h5ad(par["input_unintegrated"])
+
+# Keep only the markers flagged in var["to_correct"].
+markers_to_correct = adata.var[adata.var["to_correct"]].index.to_numpy()
+
+adata = adata[:, markers_to_correct]
+
+print("Randomise features", flush=True)
+# No partition is given, so rows are permuted across the whole dataset.
+integrated = _randomize_features(
+    adata.layers["preprocessed"]
+)
+
+# create new anndata
+output = ad.AnnData(
+    obs=adata.obs[[]],
+    var=adata.var[[]],
+    layers={"integrated": integrated},
+    uns={
+        "dataset_id": adata.uns["dataset_id"],
+        "method_id": meta["name"],
+        "parameters": {},
+    },
+)
+
+print("Write output AnnData to file", flush=True)
+output.write_h5ad(par["output"], compression="gzip")
diff --git a/src/control_methods/shuffle_integration_by_batch/config.vsh.yaml b/src/control_methods/shuffle_integration_by_batch/config.vsh.yaml
@@ -0,0 +1,17 @@
+__merge__: /src/api/comp_control_method.yaml
+name: shuffle_integration_by_batch
+label: Shuffle integration by batch
+summary: Integrations are randomly permuted within each batch
+description: Negative control in which the cells (rows) of the preprocessed marker matrix are randomly permuted within each batch.
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/control_methods/utils.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime, midmem, lowcpu]
diff --git a/src/control_methods/shuffle_integration_by_batch/script.py b/src/control_methods/shuffle_integration_by_batch/script.py
@@ -0,0 +1,49 @@
+import anndata as ad
+import sys
+
+## VIASH START
+# Placeholder values for running this script outside of viash.
+par = {
+    "input_unintegrated": "resources_test/task_cyto_batch_integration/starter_file/unintegrated_censored.h5ad",
+    "output": "output.h5ad",
+}
+meta = {
+    "name": "shuffle_integration_by_batch",
+    "resources_dir": "src/control_methods",
+}
+## VIASH END
+
+print("Importing helper functions", flush=True)
+# utils.py is listed as a component resource, so it is found in resources_dir.
+sys.path.append(meta["resources_dir"])
+from utils import _randomize_features
+
+print("Reading and preparing input files", flush=True)
+adata = ad.read_h5ad(par["input_unintegrated"])
+
+# Keep only the markers flagged in var["to_correct"].
+markers_to_correct = adata.var[adata.var["to_correct"]].index.to_numpy()
+
+adata = adata[:, markers_to_correct]
+
+print("Randomise features", flush=True)
+# Rows are permuted only among cells sharing the same obs["batch"] label.
+integrated = _randomize_features(
+    adata.layers["preprocessed"],
+    partition=adata.obs["batch"],
+)
+
+# create new anndata
+output = ad.AnnData(
+    obs=adata.obs[[]],
+    var=adata.var[[]],
+    layers={"integrated": integrated},
+    uns={
+        "dataset_id": adata.uns["dataset_id"],
+        "method_id": meta["name"],
+        "parameters": {},
+    },
+)
+
+print("Write output AnnData to file", flush=True)
+output.write_h5ad(par["output"], compression="gzip")
diff --git a/src/control_methods/shuffle_integration_by_cell_type/config.vsh.yaml b/src/control_methods/shuffle_integration_by_cell_type/config.vsh.yaml
@@ -0,0 +1,17 @@
+__merge__: /src/api/comp_control_method.yaml
+name: shuffle_integration_by_cell_type
+label: Shuffle integration by cell type
+summary: Integrations are randomly permuted within each cell type
+description: Negative control in which the cells (rows) of the preprocessed marker matrix are randomly permuted within each cell type.
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/control_methods/utils.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime, midmem, lowcpu]
diff --git a/src/control_methods/shuffle_integration_by_cell_type/script.py b/src/control_methods/shuffle_integration_by_cell_type/script.py
@@ -0,0 +1,50 @@
+import anndata as ad
+import sys
+
+## VIASH START
+# Placeholder values for running this script outside of viash.
+par = {
+    "input_unintegrated": "resources_test/task_cyto_batch_integration/starter_file/unintegrated_censored.h5ad",
+    "output": "output.h5ad",
+}
+meta = {
+    "name": "shuffle_integration_by_cell_type",
+    "resources_dir": "src/control_methods",
+}
+## VIASH END
+
+print("Importing helper functions", flush=True)
+# utils.py is listed as a component resource, so it is found in resources_dir.
+sys.path.append(meta["resources_dir"])
+from utils import _randomize_features
+
+print("Reading and preparing input files", flush=True)
+adata = ad.read_h5ad(par["input_unintegrated"])
+
+# Keep only the markers flagged in var["to_correct"].
+markers_to_correct = adata.var[adata.var["to_correct"]].index.to_numpy()
+
+adata = adata[:, markers_to_correct]
+
+print("Randomise features", flush=True)
+# Rows are permuted only among cells sharing the same obs["cell_type"] label.
+# NOTE(review): needs obs["cell_type"]; confirm the input file provides it.
+integrated = _randomize_features(
+    adata.layers["preprocessed"],
+    partition=adata.obs["cell_type"],
+)
+
+# create new anndata
+output = ad.AnnData(
+    obs=adata.obs[[]],
+    var=adata.var[[]],
+    layers={"integrated": integrated},
+    uns={
+        "dataset_id": adata.uns["dataset_id"],
+        "method_id": meta["name"],
+        "parameters": {},
+    },
+)
+
+print("Write output AnnData to file", flush=True)
+output.write_h5ad(par["output"], compression="gzip")
diff --git a/src/control_methods/utils.py b/src/control_methods/utils.py
@@ -0,0 +1,47 @@
+import numpy as np
+
+
+def _randomize_features(X, partition=None, seed=None):
+    """Return a copy of ``X`` whose rows are randomly permuted.
+
+    Taken and adapted from opsca-v1:
+    https://github.com/openproblems-bio/openproblems/blob/acf5c95a7306b819c4a13972783433d0a48f769b/openproblems/tasks/_batch_integration/_common/methods/baseline.py#L13
+
+    Parameters
+    ----------
+    X
+        Row-indexable matrix exposing ``copy()`` and ``shape``. It is only
+        read; the shuffled rows are written to a copy.
+    partition
+        Optional 1-D labels, one per row of ``X``. Rows are only exchanged
+        with rows carrying the same label. ``None`` shuffles all rows
+        together.
+    seed
+        Optional seed for a reproducible shuffle. ``None`` (the default)
+        keeps drawing from NumPy's global random state.
+
+    Returns
+    -------
+    A copy of ``X`` with its rows permuted within each partition.
+
+    Raises
+    ------
+    ValueError
+        If ``partition`` does not hold exactly one label per row of ``X``.
+    """
+    X_out = X.copy()
+    # ``np.random`` and ``Generator`` both expose ``permutation``, so the
+    # unseeded path behaves exactly as before.
+    rng = np.random if seed is None else np.random.default_rng(seed)
+    if partition is None:
+        partition = np.full(X.shape[0], 0)
+    else:
+        partition = np.asarray(partition)
+        if partition.shape != (X.shape[0],):
+            raise ValueError(
+                "partition must hold exactly one label per row of X"
+            )
+    for partition_name in np.unique(partition):
+        partition_idx = np.flatnonzero(partition == partition_name)
+        X_out[partition_idx] = X[rng.permutation(partition_idx)]
+    return X_out