From bcc302b517182bac98d8f936651256348a95576f Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Fri, 8 Nov 2024 11:54:50 +0100 Subject: [PATCH 1/5] Pre-filter batches in hvg overlap metric Remove those with insufficient genes before running the scoring function --- src/metrics/hvg_overlap/script.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/metrics/hvg_overlap/script.py b/src/metrics/hvg_overlap/script.py index b902fe08..61e980c0 100644 --- a/src/metrics/hvg_overlap/script.py +++ b/src/metrics/hvg_overlap/script.py @@ -1,6 +1,9 @@ import sys import anndata as ad +import numpy as np +import scanpy as sc from scib.metrics import hvg_overlap +from scib.utils import split_batches ## VIASH START par = { @@ -36,6 +39,20 @@ print("Copy batch information", flush=True) adata_integrated.obs['batch'] = adata_solution.obs['batch'] +print("Remove batches with insufficient genes", flush=True) +adata_list = split_batches(adata_solution, "batch", hvg=adata_integrated.var_names) +skip_batches = [] +for adata_batch in adata_list: + sc.pp.filter_genes(adata_batch, min_cells=1) + n_hvg_tmp = np.minimum(500, int(0.5 * adata_batch.n_vars)) + if n_hvg_tmp < 500: + batch = adata_batch.obs["batch"][0] + print(f"Batch '{batch}' has insufficient genes (0.5 * {adata_batch.n_vars} < 500) and will be skipped", flush=True) + skip_batches.append(batch) + +adata_solution = adata_solution[~adata_solution.obs['batch'].isin(skip_batches)] +adata_integrated = adata_integrated[~adata_integrated.obs['batch'].isin(skip_batches)] + print('Compute score', flush=True) score = hvg_overlap( adata_solution, From bd8769367d7e9aa5c9cc7fca0f1d3d49ae38ade1 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Fri, 8 Nov 2024 11:56:07 +0100 Subject: [PATCH 2/5] Style hvg overlap script --- src/metrics/hvg_overlap/script.py | 59 ++++++++++++++----------------- 1 file changed, 26 insertions(+), 33 deletions(-) diff --git a/src/metrics/hvg_overlap/script.py b/src/metrics/hvg_overlap/script.py index 61e980c0..2db0f9b8 100644 --- a/src/metrics/hvg_overlap/script.py +++ b/src/metrics/hvg_overlap/script.py @@ -1,4 +1,5 @@ import sys + import anndata as ad import numpy as np import scanpy as sc @@ -7,37 +8,30 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', - 'input_solution': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad', - 'output': 'output.h5ad', -} -meta = { - 'name': 'foo', - "resources_dir": "src/utils" + "input_integrated": "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad", + "input_solution": "resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad", + "output": "output.h5ad", } +meta = {"name": "foo", "resources_dir": "src/utils"} ## VIASH END sys.path.append(meta["resources_dir"]) from read_anndata_partial import read_anndata -print('Read input', flush=True) +print("Read input", flush=True) adata_solution = read_anndata( - par['input_solution'], - X='layers/normalized', - obs='obs', - var='var', - uns='uns' + par["input_solution"], X="layers/normalized", obs="obs", var="var", uns="uns" ) adata_integrated = read_anndata( - par['input_integrated'], - X='layers/corrected_counts', - obs='obs', - var='var', - uns='uns' + par["input_integrated"], + X="layers/corrected_counts", + obs="obs", + var="var", + uns="uns", ) print("Copy batch information", flush=True) -adata_integrated.obs['batch'] = adata_solution.obs['batch'] +adata_integrated.obs["batch"] = adata_solution.obs["batch"] print("Remove batches with insufficient genes", flush=True) adata_list = split_batches(adata_solution, "batch", hvg=adata_integrated.var_names) @@ -47,27 +41,26 @@ n_hvg_tmp = np.minimum(500, int(0.5 * adata_batch.n_vars)) if n_hvg_tmp < 500: batch = adata_batch.obs["batch"][0] - print(f"Batch '{batch}' has insufficient genes (0.5 * {adata_batch.n_vars} < 500) and will be skipped", flush=True) + print( + f"Batch '{batch}' has insufficient genes (0.5 * {adata_batch.n_vars} < 500) and will be skipped", + flush=True, + ) skip_batches.append(batch) -adata_solution = adata_solution[~adata_solution.obs['batch'].isin(skip_batches)] -adata_integrated = adata_integrated[~adata_integrated.obs['batch'].isin(skip_batches)] +adata_solution = adata_solution[~adata_solution.obs["batch"].isin(skip_batches)] +adata_integrated = adata_integrated[~adata_integrated.obs["batch"].isin(skip_batches)] -print('Compute score', flush=True) -score = hvg_overlap( - adata_solution, - adata_integrated, - batch_key="batch" -) +print("Compute score", flush=True) +score = hvg_overlap(adata_solution, adata_integrated, batch_key="batch") print("Create output AnnData object", flush=True) output = ad.AnnData( uns={ - "dataset_id": adata_solution.uns['dataset_id'], - 'normalization_id': adata_solution.uns['normalization_id'], - "method_id": adata_integrated.uns['method_id'], - "metric_ids": [meta['name']], - "metric_values": [score] + "dataset_id": adata_solution.uns["dataset_id"], + "normalization_id": adata_solution.uns["normalization_id"], + "method_id": adata_integrated.uns["method_id"], + "metric_ids": [meta["name"]], + "metric_values": [score], } ) From 4e21e407d0064225a6be2c88a6f586ad5a17d7ad Mon Sep 17 00:00:00 2001 From: Kai Waldrant Date: Fri, 8 Nov 2024 12:05:37 +0100 Subject: [PATCH 3/5] fix submodule --- common | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common b/common index b0239da4..65e05af6 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit b0239da4bab97d57ab5116fefb5b64bbde063db4 +Subproject commit 65e05af68a11ee87853fcf7a3c6b579001f21abe From 06b7603a8aa5f5fb0e0b384a5d321387f18dadd9 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Fri, 8 Nov 2024 11:54:50 +0100 Subject: [PATCH 4/5] Pre-filter batches in hvg overlap metric Remove those with insufficient genes before running the scoring function --- src/metrics/hvg_overlap/script.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/metrics/hvg_overlap/script.py b/src/metrics/hvg_overlap/script.py index b902fe08..61e980c0 100644 --- a/src/metrics/hvg_overlap/script.py +++ b/src/metrics/hvg_overlap/script.py @@ -1,6 +1,9 @@ import sys import anndata as ad +import numpy as np +import scanpy as sc from scib.metrics import hvg_overlap +from scib.utils import split_batches ## VIASH START par = { @@ -36,6 +39,20 @@ print("Copy batch information", flush=True) adata_integrated.obs['batch'] = adata_solution.obs['batch'] +print("Remove batches with insufficient genes", flush=True) +adata_list = split_batches(adata_solution, "batch", hvg=adata_integrated.var_names) +skip_batches = [] +for adata_batch in adata_list: + sc.pp.filter_genes(adata_batch, min_cells=1) + n_hvg_tmp = np.minimum(500, int(0.5 * adata_batch.n_vars)) + if n_hvg_tmp < 500: + batch = adata_batch.obs["batch"][0] + print(f"Batch '{batch}' has insufficient genes (0.5 * {adata_batch.n_vars} < 500) and will be skipped", flush=True) + skip_batches.append(batch) + +adata_solution = adata_solution[~adata_solution.obs['batch'].isin(skip_batches)] +adata_integrated = adata_integrated[~adata_integrated.obs['batch'].isin(skip_batches)] + print('Compute score', flush=True) score = hvg_overlap( adata_solution, From 6542acbceb0d04c229c9c0fbec81479f8d91c528 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Fri, 8 Nov 2024 11:56:07 +0100 Subject: [PATCH 5/5] Style hvg overlap script --- src/metrics/hvg_overlap/script.py | 59 ++++++++++++++----------------- 1 file changed, 26 insertions(+), 33 deletions(-) diff --git a/src/metrics/hvg_overlap/script.py b/src/metrics/hvg_overlap/script.py index 61e980c0..2db0f9b8 100644 --- a/src/metrics/hvg_overlap/script.py +++ b/src/metrics/hvg_overlap/script.py @@ -1,4 +1,5 @@ import sys + import anndata as ad import numpy as np import scanpy as sc @@ -7,37 +8,30 @@ ## VIASH START par = { - 'input_integrated': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad', - 'input_solution': 'resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad', - 'output': 'output.h5ad', -} -meta = { - 'name': 'foo', - "resources_dir": "src/utils" + "input_integrated": "resources_test/task_batch_integration/cxg_immune_cell_atlas/integrated_full.h5ad", + "input_solution": "resources_test/task_batch_integration/cxg_immune_cell_atlas/solution.h5ad", + "output": "output.h5ad", } +meta = {"name": "foo", "resources_dir": "src/utils"} ## VIASH END sys.path.append(meta["resources_dir"]) from read_anndata_partial import read_anndata -print('Read input', flush=True) +print("Read input", flush=True) adata_solution = read_anndata( - par['input_solution'], - X='layers/normalized', - obs='obs', - var='var', - uns='uns' + par["input_solution"], X="layers/normalized", obs="obs", var="var", uns="uns" ) adata_integrated = read_anndata( - par['input_integrated'], - X='layers/corrected_counts', - obs='obs', - var='var', - uns='uns' + par["input_integrated"], + X="layers/corrected_counts", + obs="obs", + var="var", + uns="uns", ) print("Copy batch information", flush=True) -adata_integrated.obs['batch'] = adata_solution.obs['batch'] +adata_integrated.obs["batch"] = adata_solution.obs["batch"] print("Remove batches with insufficient genes", flush=True) adata_list = split_batches(adata_solution, "batch", hvg=adata_integrated.var_names) @@ -47,27 +41,26 @@ n_hvg_tmp = np.minimum(500, int(0.5 * adata_batch.n_vars)) if n_hvg_tmp < 500: batch = adata_batch.obs["batch"][0] - print(f"Batch '{batch}' has insufficient genes (0.5 * {adata_batch.n_vars} < 500) and will be skipped", flush=True) + print( + f"Batch '{batch}' has insufficient genes (0.5 * {adata_batch.n_vars} < 500) and will be skipped", + flush=True, + ) skip_batches.append(batch) -adata_solution = adata_solution[~adata_solution.obs['batch'].isin(skip_batches)] -adata_integrated = adata_integrated[~adata_integrated.obs['batch'].isin(skip_batches)] +adata_solution = adata_solution[~adata_solution.obs["batch"].isin(skip_batches)] +adata_integrated = adata_integrated[~adata_integrated.obs["batch"].isin(skip_batches)] -print('Compute score', flush=True) -score = hvg_overlap( - adata_solution, - adata_integrated, - batch_key="batch" -) +print("Compute score", flush=True) +score = hvg_overlap(adata_solution, adata_integrated, batch_key="batch") print("Create output AnnData object", flush=True) output = ad.AnnData( uns={ - "dataset_id": adata_solution.uns['dataset_id'], - 'normalization_id': adata_solution.uns['normalization_id'], - "method_id": adata_integrated.uns['method_id'], - "metric_ids": [meta['name']], - "metric_values": [score] + "dataset_id": adata_solution.uns["dataset_id"], + "normalization_id": adata_solution.uns["normalization_id"], + "method_id": adata_integrated.uns["method_id"], + "metric_ids": [meta["name"]], + "metric_values": [score], } )