From 473b6033d351cc7fecebd6aeb7849e6f0ba4a9d6 Mon Sep 17 00:00:00 2001 From: Givanna Putri Date: Wed, 18 Dec 2024 21:46:10 +1100 Subject: [PATCH 1/3] first commit --- src/api/file_integrated.yaml | 2 +- src/api/file_score.yaml | 13 +-- src/metrics/emd_per_samples/config.vsh.yaml | 91 +++++++++++++++++++++ src/metrics/emd_per_samples/script.py | 57 +++++++++++++ 4 files changed, 156 insertions(+), 7 deletions(-) create mode 100644 src/metrics/emd_per_samples/config.vsh.yaml create mode 100644 src/metrics/emd_per_samples/script.py diff --git a/src/api/file_integrated.yaml b/src/api/file_integrated.yaml index b2c6a34..9882901 100644 --- a/src/api/file_integrated.yaml +++ b/src/api/file_integrated.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_cyto_batch_integration/cxg_mouse_pancreas_atlas/prediction.h5ad" +example: "resources_test/task_cyto_batch_integration/starter_file/integrated.h5ad" label: Integrated summary: "Integrated dataset" info: diff --git a/src/api/file_score.yaml b/src/api/file_score.yaml index 81dbf9c..f8fe667 100644 --- a/src/api/file_score.yaml +++ b/src/api/file_score.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_cyto_batch_integration/cxg_mouse_pancreas_atlas/score.h5ad" +example: "resources_test/task_cyto_batch_integration/starter_file/score.h5ad" label: Score summary: "File indicating the score of a metric." 
info: @@ -11,12 +11,13 @@ info: description: "A unique identifier for the dataset" required: true - type: string - name: normalization_id - description: "Which normalization was used" + name: method_id + description: "A unique identifier for the batch correction method" required: true - type: string - name: method_id - description: "A unique identifier for the method" + name: sample_ids + description: "The samples assessed by the metric" + multiple: true required: true - type: string name: metric_ids @@ -25,6 +26,6 @@ info: required: true - type: double name: metric_values - description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." + description: "The metric values obtained. Must be of same length as 'metric_ids'." multiple: true required: true \ No newline at end of file diff --git a/src/metrics/emd_per_samples/config.vsh.yaml b/src/metrics/emd_per_samples/config.vsh.yaml new file mode 100644 index 0000000..02a826e --- /dev/null +++ b/src/metrics/emd_per_samples/config.vsh.yaml @@ -0,0 +1,91 @@ +# The API specifies which type of component this is. +# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_metric.yaml + +# A unique identifier for your component (required). +# Can contain only lowercase letters or underscores. +name: emd_per_samples + +# Metadata for your component +info: + metrics: + # A unique identifier for your metric (required). + # Can contain only lowercase letters or underscores. + - name: emd_per_samples + # A relatively short label, used when rendering visualisations (required) + label: EMD Per Samples + # A one sentence summary of how this metric works (required). Used when + # rendering summary tables. + summary: "Earth Mover Distance to compute differences in marker expression across two samples." + # A multi-line description of how this component works (required). Used + # when rendering reference documentation. 
+ description: | + Earth Mover Distance (EMD) is a metric designed for comparing two distributions. + It is also known as the Wasserstein metric. + references: + doi: + - 10.1023/A:1026543900054 + bibtex: + - | + @article{rubner2000earth, + title={The earth mover's distance as a metric for image retrieval}, + author={Rubner, Yossi and Tomasi, Carlo and Guibas, Leonidas J}, + journal={International journal of computer vision}, + volume={40}, + pages={99--121}, + year={2000}, + publisher={Springer} + } + links: + # URL to the documentation for this metric (required). + documentation: https://cytonormpy.readthedocs.io/en/latest/generated/cytonormpy.emd_comparison_from_anndata.html + # URL to the code repository for this metric (required). + repository: https://github.com/TarikExner/CytoNormPy + # The minimum possible value for this metric (required) + min: 0 + # The maximum possible value for this metric (required) + max: .inf + # Whether a higher value represents a 'better' solution (required) + maximize: false + +# Component-specific parameters (optional) +arguments: + - name: "--samples_to_compare" + type: "string" + default: "Tube1_Batch1_WT,Tube1_Batch2_WT" + description: 2 samples to compare. Separate the sample names by comma + - name: "--layer" + type: "string" + default: "integrated" + description: The layer in input anndata containing the marker expression + +# Resources required to run the component +resources: + # The script of your component (required) + - type: python_script + path: script.py + # Additional resources your script needs (optional) + # - type: file + # path: weights.pt + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_python:1.0.0 + # Add custom dependencies here (optional). For more information, see + # https://viash.io/reference/config/engines/docker/#setup . 
+ setup: + - type: python + packages: [anndata] + github: [TarikExner/CytoNormPy] + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline. + - type: nextflow + directives: + label: [midtime,midmem,midcpu] diff --git a/src/metrics/emd_per_samples/script.py b/src/metrics/emd_per_samples/script.py new file mode 100644 index 0000000..0331b36 --- /dev/null +++ b/src/metrics/emd_per_samples/script.py @@ -0,0 +1,57 @@ +import anndata as ad +import cytonormpy as cnp + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + "input_integrated": "resources_test/task_cyto_batch_integration/starter_file/integrated.h5ad", + "output": "output.h5ad", + "samples_to_compare": "Tube1_Batch1_WT,Tube1_Batch2_WT", + "layer": "integrated", +} +meta = {"name": "emd_per_samples"} +## VIASH END + +print("Reading input files", flush=True) + +adata = ad.read_h5ad(par["input_integrated"]) + +samples_to_compare = [x.strip() for x in par["samples_to_compare"].split(",")] + +layer = par["layer"] + +markers_to_assess = adata.var[adata.var["to_correct"]].index.to_numpy() + +print("Compute metrics", flush=True) + +# have to change the "sample" column to file_name for emd_comparison_from_anndata to work. +# Otherwise the _calculate_emd_per_frame used in cytonormpy will error because they +# hardcoded the column file_name and use it in assert. 
+# See line 176 of https://github.com/TarikExner/CytoNormPy/blob/main/cytonormpy/_evaluation/_emd_utils.py#L173 +adata.obs["file_name"] = adata.obs["sample"] + +df = cnp.emd_from_anndata( + adata=adata, + file_list=samples_to_compare, + channels=markers_to_assess, + layer=layer, + sample_identifier_column="file_name", +) + +uns_metric_ids = [f"EMD_per_samples_{x}" for x in df.columns] +uns_metric_values = df.loc["all_cells"].to_numpy() +uns_method_id = adata.uns["method_id"] if "method_id" in adata.uns else "unintegrated" + + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + uns={ + "dataset_id": adata.uns["dataset_id"], + "method_id": uns_method_id, + "sample_ids": samples_to_compare, + "metric_ids": uns_metric_ids, + "metric_values": uns_metric_values, + } +) +output.write_h5ad(par["output"], compression="gzip") From 60e00fed6078a1880b9ca2c02ed11de057b26470 Mon Sep 17 00:00:00 2001 From: Givanna Putri Date: Wed, 18 Dec 2024 22:03:13 +1100 Subject: [PATCH 2/3] Update CHANGELOG.md --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 557813c..dfba125 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ * Added `methods/limma_remove_batch_effect` component (PR #7). +* Added `metrics/emd_per_samples` component (PR #9). 
+ ## MAJOR CHANGES ## MINOR CHANGES From 8b05f332c4bd03660217ead3a1ef041e752b7214 Mon Sep 17 00:00:00 2001 From: Givanna Putri Date: Thu, 19 Dec 2024 20:48:54 +1100 Subject: [PATCH 3/3] change samples_to_compare default values --- src/metrics/emd_per_samples/config.vsh.yaml | 26 ++++++++++----------- src/metrics/emd_per_samples/script.py | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/metrics/emd_per_samples/config.vsh.yaml b/src/metrics/emd_per_samples/config.vsh.yaml index 02a826e..d3ca774 100644 --- a/src/metrics/emd_per_samples/config.vsh.yaml +++ b/src/metrics/emd_per_samples/config.vsh.yaml @@ -28,17 +28,6 @@ info: references: doi: - 10.1023/A:1026543900054 - bibtex: - - | - @article{rubner2000earth, - title={The earth mover's distance as a metric for image retrieval}, - author={Rubner, Yossi and Tomasi, Carlo and Guibas, Leonidas J}, - journal={International journal of computer vision}, - volume={40}, - pages={99--121}, - year={2000}, - publisher={Springer} - } links: # URL to the documentation for this metric (required). documentation: https://cytonormpy.readthedocs.io/en/latest/generated/cytonormpy.emd_comparison_from_anndata.html @@ -50,13 +39,24 @@ info: max: .inf # Whether a higher value represents a 'better' solution (required) maximize: false - + # Note: need this if we have component specific argument with no default. + # When running the actual command, either split the sample name by ; + # so Tube1_Batch1_WT;Tube1_Batch2_WT + # or repeat the flag twice. So --samples_to_compare Tube1_Batch1_WT + # --samples_to_compare Tube1_Batch2_WT + test_setup: + starter_file: + samples_to_compare: + - Tube1_Batch1_WT + - Tube1_Batch2_WT + # Component-specific parameters (optional) arguments: - name: "--samples_to_compare" type: "string" - default: "Tube1_Batch1_WT,Tube1_Batch2_WT" description: 2 samples to compare. 
Separate the sample names by comma + required: true + multiple: true - name: "--layer" type: "string" default: "integrated" diff --git a/src/metrics/emd_per_samples/script.py b/src/metrics/emd_per_samples/script.py index 0331b36..80571e7 100644 --- a/src/metrics/emd_per_samples/script.py +++ b/src/metrics/emd_per_samples/script.py @@ -17,7 +17,7 @@ adata = ad.read_h5ad(par["input_integrated"]) -samples_to_compare = [x.strip() for x in par["samples_to_compare"].split(",")] +samples_to_compare = [x.strip() for x in par["samples_to_compare"]] layer = par["layer"]