diff --git a/CHANGELOG.md b/CHANGELOG.md
index 557813c..0feb49c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,11 @@
 
 * Added `methods/limma_remove_batch_effect` component (PR #7).
 
+* Added three negative control methods (PR #8):
+  - `control_methods/shuffle_integration`
+  - `control_methods/shuffle_integration_by_batch`
+  - `control_methods/shuffle_integration_by_cell_type`
+
 ## MAJOR CHANGES
 
 ## MINOR CHANGES
diff --git a/src/control_methods/shuffle_integration/config.vsh.yaml b/src/control_methods/shuffle_integration/config.vsh.yaml
new file mode 100644
index 0000000..b3df8a6
--- /dev/null
+++ b/src/control_methods/shuffle_integration/config.vsh.yaml
@@ -0,0 +1,17 @@
+__merge__: /src/api/comp_control_method.yaml
+name: shuffle_integration
+label: Shuffle integration
+summary: Integrations are randomly permuted
+description: Integrations are randomly permuted
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/control_methods/utils.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime, midmem, lowcpu]
diff --git a/src/control_methods/shuffle_integration/script.py b/src/control_methods/shuffle_integration/script.py
new file mode 100644
index 0000000..d943211
--- /dev/null
+++ b/src/control_methods/shuffle_integration/script.py
@@ -0,0 +1,43 @@
+import anndata as ad
+import sys
+
+## VIASH START
+par = {
+    "input_unintegrated": "resources_test/task_cyto_batch_integration/starter_file/unintegrated_censored.h5ad",
+    "output": "output.h5ad",
+}
+meta = {"name": "shuffle_integration"}
+## VIASH END
+
+print("Importing helper functions", flush=True)
+sys.path.append(meta["resources_dir"])
+from utils import _randomize_features
+
+print("Reading and preparing input files", flush=True)
+adata = ad.read_h5ad(par["input_unintegrated"])
+
+adata.obs["batch_str"] = adata.obs["batch"].astype(str)
+
+markers_to_correct = adata.var[adata.var["to_correct"]].index.to_numpy()
+
+adata = adata[:, markers_to_correct]
+
+print("Randomise features", flush=True)
+integrated = _randomize_features(
+    adata.layers["preprocessed"]
+)
+
+# create new anndata
+output = ad.AnnData(
+    obs=adata.obs[[]],
+    var=adata.var[[]],
+    layers={"integrated": integrated},
+    uns={
+        "dataset_id": adata.uns["dataset_id"],
+        "method_id": meta["name"],
+        "parameters": {},
+    },
+)
+
+print("Write output AnnData to file", flush=True)
+output.write_h5ad(par["output"], compression="gzip")
diff --git a/src/control_methods/shuffle_integration_by_batch/config.vsh.yaml b/src/control_methods/shuffle_integration_by_batch/config.vsh.yaml
new file mode 100644
index 0000000..e249ef8
--- /dev/null
+++ b/src/control_methods/shuffle_integration_by_batch/config.vsh.yaml
@@ -0,0 +1,17 @@
+__merge__: /src/api/comp_control_method.yaml
+name: shuffle_integration_by_batch
+label: Shuffle integration by batch
+summary: Integrations are randomly permuted within each batch
+description: Integrations are randomly permuted within each batch
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/control_methods/utils.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime, midmem, lowcpu]
diff --git a/src/control_methods/shuffle_integration_by_batch/script.py b/src/control_methods/shuffle_integration_by_batch/script.py
new file mode 100644
index 0000000..07c8b18
--- /dev/null
+++ b/src/control_methods/shuffle_integration_by_batch/script.py
@@ -0,0 +1,44 @@
+import anndata as ad
+import sys
+
+## VIASH START
+par = {
+    "input_unintegrated": "resources_test/task_cyto_batch_integration/starter_file/unintegrated_censored.h5ad",
+    "output": "output.h5ad",
+}
+meta = {"name": "shuffle_integration_by_batch"}
+## VIASH END
+
+print("Importing helper functions", flush=True)
+sys.path.append(meta["resources_dir"])
+from utils import _randomize_features
+
+print("Reading and preparing input files", flush=True)
+adata = ad.read_h5ad(par["input_unintegrated"])
+
+adata.obs["batch_str"] = adata.obs["batch"].astype(str)
+
+markers_to_correct = adata.var[adata.var["to_correct"]].index.to_numpy()
+
+adata = adata[:, markers_to_correct]
+
+print("Randomise features", flush=True)
+integrated = _randomize_features(
+    adata.layers["preprocessed"],
+    partition=adata.obs["batch"],
+)
+
+# create new anndata
+output = ad.AnnData(
+    obs=adata.obs[[]],
+    var=adata.var[[]],
+    layers={"integrated": integrated},
+    uns={
+        "dataset_id": adata.uns["dataset_id"],
+        "method_id": meta["name"],
+        "parameters": {},
+    },
+)
+
+print("Write output AnnData to file", flush=True)
+output.write_h5ad(par["output"], compression="gzip")
diff --git a/src/control_methods/shuffle_integration_by_cell_type/config.vsh.yaml b/src/control_methods/shuffle_integration_by_cell_type/config.vsh.yaml
new file mode 100644
index 0000000..c81a246
--- /dev/null
+++ b/src/control_methods/shuffle_integration_by_cell_type/config.vsh.yaml
@@ -0,0 +1,17 @@
+__merge__: /src/api/comp_control_method.yaml
+name: shuffle_integration_by_cell_type
+label: Shuffle integration by cell type
+summary: Integrations are randomly permuted within each cell type
+description: Integrations are randomly permuted within each cell type
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/control_methods/utils.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime, midmem, lowcpu]
diff --git a/src/control_methods/shuffle_integration_by_cell_type/script.py b/src/control_methods/shuffle_integration_by_cell_type/script.py
new file mode 100644
index 0000000..8165d57
--- /dev/null
+++ b/src/control_methods/shuffle_integration_by_cell_type/script.py
@@ -0,0 +1,44 @@
+import anndata as ad
+import sys
+
+## VIASH START
+par = {
+    "input_unintegrated": "resources_test/task_cyto_batch_integration/starter_file/unintegrated_censored.h5ad",
+    "output": "output.h5ad",
+}
+meta = {"name": "shuffle_integration_by_cell_type"}
+## VIASH END
+
+print("Importing helper functions", flush=True)
+sys.path.append(meta["resources_dir"])
+from utils import _randomize_features
+
+print("Reading and preparing input files", flush=True)
+adata = ad.read_h5ad(par["input_unintegrated"])
+
+adata.obs["batch_str"] = adata.obs["batch"].astype(str)
+
+markers_to_correct = adata.var[adata.var["to_correct"]].index.to_numpy()
+
+adata = adata[:, markers_to_correct]
+
+print("Randomise features", flush=True)
+integrated = _randomize_features(
+    adata.layers["preprocessed"],
+    partition=adata.obs["cell_type"],
+)
+
+# create new anndata
+output = ad.AnnData(
+    obs=adata.obs[[]],
+    var=adata.var[[]],
+    layers={"integrated": integrated},
+    uns={
+        "dataset_id": adata.uns["dataset_id"],
+        "method_id": meta["name"],
+        "parameters": {},
+    },
+)
+
+print("Write output AnnData to file", flush=True)
+output.write_h5ad(par["output"], compression="gzip")
diff --git a/src/control_methods/utils.py b/src/control_methods/utils.py
new file mode 100644
index 0000000..b4f4673
--- /dev/null
+++ b/src/control_methods/utils.py
@@ -0,0 +1,32 @@
+import numpy as np
+
+def _randomize_features(X, partition=None):
+    """
+    Return a copy of X whose rows are randomly permuted, optionally per group.
+
+    Taken and adapted from opsca-v1:
+    https://github.com/openproblems-bio/openproblems/blob/acf5c95a7306b819c4a13972783433d0a48f769b/openproblems/tasks/_batch_integration/_common/methods/baseline.py#L13
+
+    Parameters
+    ----------
+    X : array-like
+        Matrix with observations (cells) as rows; only row order is shuffled.
+    partition : array-like of length X.shape[0], optional
+        Group labels; rows are only permuted with rows sharing the same
+        label. If None, all rows are shuffled together.
+
+    Returns
+    -------
+    A shuffled copy of X; the input X is not modified.
+    """
+    X_out = X.copy()
+    if partition is None:
+        # one partition spanning every row
+        partition = np.full(X.shape[0], 0)
+    else:
+        partition = np.asarray(partition)
+    for partition_name in np.unique(partition):
+        partition_idx = np.argwhere(partition == partition_name).flatten()
+        # permute rows only within this partition
+        X_out[partition_idx] = X[np.random.permutation(partition_idx)]
+    return X_out