diff --git a/CHANGELOG.md b/CHANGELOG.md index 0feb49c..52f160e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,8 @@ - `control_methods/shuffle_integration_by_batch` - `control_methods/shuffle_integration_by_cell_type` +* Added `metrics/emd_per_samples` component (PR #9). + ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/api/file_score.yaml b/src/api/file_score.yaml index 81dbf9c..f8fe667 100644 --- a/src/api/file_score.yaml +++ b/src/api/file_score.yaml @@ -1,5 +1,5 @@ type: file -example: "resources_test/task_cyto_batch_integration/cxg_mouse_pancreas_atlas/score.h5ad" +example: "resources_test/task_cyto_batch_integration/starter_file/score.h5ad" label: Score summary: "File indicating the score of a metric." info: @@ -11,12 +11,13 @@ info: description: "A unique identifier for the dataset" required: true - type: string - name: normalization_id - description: "Which normalization was used" + name: method_id + description: "A unique identifier for the batch correction method" required: true - type: string - name: method_id - description: "A unique identifier for the method" + name: sample_ids + description: "The samples assessed by the metric" + multiple: true required: true - type: string name: metric_ids @@ -25,6 +26,6 @@ info: required: true - type: double name: metric_values - description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." + description: "The metric values obtained. Must be of same length as 'metric_ids'." multiple: true required: true \ No newline at end of file diff --git a/src/metrics/emd_per_samples/config.vsh.yaml b/src/metrics/emd_per_samples/config.vsh.yaml new file mode 100644 index 0000000..d3ca774 --- /dev/null +++ b/src/metrics/emd_per_samples/config.vsh.yaml @@ -0,0 +1,91 @@ +# The API specifies which type of component this is. 
+# It contains specifications for: +# - The input/output files +# - Common parameters +# - A unit test +__merge__: ../../api/comp_metric.yaml + +# A unique identifier for your component (required). +# Can contain only lowercase letters or underscores. +name: emd_per_samples + +# Metadata for your component +info: + metrics: + # A unique identifier for your metric (required). + # Can contain only lowercase letters or underscores. + - name: emd_per_samples + # A relatively short label, used when rendering visualisations (required) + label: EMD Per Samples + # A one sentence summary of how this metric works (required). Used when + # rendering summary tables. + summary: "Earth Mover Distance to compute differences in marker expression across two samples." + # A multi-line description of how this component works (required). Used + # when rendering reference documentation. + description: | + Earth Mover Distance (EMD) is a metric designed for comparing two distributions. + It is also known as the Wasserstein metric. + references: + doi: + - 10.1023/A:1026543900054 + links: + # URL to the documentation for this metric (required). + documentation: https://cytonormpy.readthedocs.io/en/latest/generated/cytonormpy.emd_comparison_from_anndata.html + # URL to the code repository for this metric (required). + repository: https://github.com/TarikExner/CytoNormPy + # The minimum possible value for this metric (required) + min: 0 + # The maximum possible value for this metric (required) + max: .inf + # Whether a higher value represents a 'better' solution (required) + maximize: false + # Note: need this if we have component specific argument with no default. + # When running the actual command, either split the sample name by ; + # so Tube1_Batch1_WT;Tube1_Batch2_WT + # or repeat the flag twice. 
So --samples_to_compare Tube1_Batch1_WT + # --samples_to_compare Tube1_Batch2_WT + test_setup: + starter_file: + samples_to_compare: + - Tube1_Batch1_WT + - Tube1_Batch2_WT + +# Component-specific parameters (optional) +arguments: + - name: "--samples_to_compare" + type: "string" + description: Two samples to compare. Separate the sample names with a semicolon (;) or repeat the flag once per sample + required: true + multiple: true + - name: "--layer" + type: "string" + default: "integrated" + description: The layer in input anndata containing the marker expression + +# Resources required to run the component +resources: + # The script of your component (required) + - type: python_script + path: script.py + # Additional resources your script needs (optional) + # - type: file + # path: weights.pt + +engines: + # Specifications for the Docker image for this component. + - type: docker + image: openproblems/base_python:1.0.0 + # Add custom dependencies here (optional). For more information, see + # https://viash.io/reference/config/engines/docker/#setup . + setup: + - type: python + packages: [anndata] + github: [TarikExner/CytoNormPy] + +runners: + # This platform allows running the component natively + - type: executable + # Allows turning the component into a Nextflow module / pipeline. + - type: nextflow + directives: + label: [midtime,midmem,midcpu] diff --git a/src/metrics/emd_per_samples/script.py b/src/metrics/emd_per_samples/script.py new file mode 100644 index 0000000..80571e7 --- /dev/null +++ b/src/metrics/emd_per_samples/script.py @@ -0,0 +1,57 @@ +import anndata as ad +import cytonormpy as cnp + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
par = {
    "input_integrated": "resources_test/task_cyto_batch_integration/starter_file/integrated.h5ad",
    "output": "output.h5ad",
    # `--samples_to_compare` is declared with `multiple: true` in config.vsh.yaml,
    # so the placeholder is a list of sample names rather than one joined string.
    "samples_to_compare": ["Tube1_Batch1_WT", "Tube1_Batch2_WT"],
    "layer": "integrated",
}
meta = {"name": "emd_per_samples"}
## VIASH END

print("Reading input files", flush=True)

adata = ad.read_h5ad(par["input_integrated"])

# Normalise the requested sample names. A list is used as-is; a single delimited
# string (";" or ",") is split first, so that a hand-edited placeholder is never
# iterated character by character.
requested_samples = par["samples_to_compare"]
if isinstance(requested_samples, str):
    requested_samples = requested_samples.replace(";", ",").split(",")
samples_to_compare = [name.strip() for name in requested_samples if name.strip()]
if not samples_to_compare:
    raise ValueError("No sample names were provided in 'samples_to_compare'.")

layer = par["layer"]

# Only the markers flagged in `adata.var["to_correct"]` are scored.
markers_to_assess = adata.var[adata.var["to_correct"]].index.to_numpy()

print("Compute metrics", flush=True)

# The "sample" column has to be duplicated as "file_name" for the EMD computation
# to work: `_calculate_emd_per_frame` in cytonormpy hard-codes the column name
# `file_name` and asserts on it.
# See https://github.com/TarikExner/CytoNormPy/blob/main/cytonormpy/_evaluation/_emd_utils.py#L173
adata.obs["file_name"] = adata.obs["sample"]

df = cnp.emd_from_anndata(
    adata=adata,
    file_list=samples_to_compare,
    channels=markers_to_assess,
    layer=layer,
    sample_identifier_column="file_name",
)

# One metric per column of the EMD table; values are taken from the "all_cells" row.
uns_metric_ids = [f"EMD_per_samples_{x}" for x in df.columns]
uns_metric_values = df.loc["all_cells"].to_numpy()
# Inputs that carry no method id are labelled "unintegrated".
uns_method_id = adata.uns.get("method_id", "unintegrated")


print("Write output AnnData to file", flush=True)
output = ad.AnnData(
    uns={
        "dataset_id": adata.uns["dataset_id"],
        "method_id": uns_method_id,
        "sample_ids": samples_to_compare,
        "metric_ids": uns_metric_ids,
        "metric_values": uns_metric_values,
    }
)
output.write_h5ad(par["output"], compression="gzip")