From 4eb7181811de736ad2e6d48336c639a66dbc5b84 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Tue, 17 Sep 2024 22:54:43 +0200 Subject: [PATCH] Add api files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: MalteDLuecken Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Co-authored-by: Michaela Müller <51025211+mumichae@users.noreply.github.com> Co-authored-by: Kai Waldrant --- .vscode/settings.json | 5 +- CHANGELOG.md | 2 +- README.md | 376 ++++++++++++++++-- _viash.yaml | 119 +++--- common | 2 +- scripts/create_resources/resources.sh | 8 +- scripts/create_resources/test_resources.sh | 18 +- scripts/run_benchmark/run_full_local.sh | 4 +- scripts/run_benchmark/run_full_seqeracloud.sh | 10 +- scripts/run_benchmark/run_test_local.sh | 4 +- scripts/run_benchmark/run_test_seqeracloud.sh | 10 +- src/api/comp_control_method.yaml | 33 +- src/api/comp_data_processor.yaml | 31 -- src/api/comp_method.yaml | 26 +- src/api/comp_metric.yaml | 24 +- src/api/comp_process_dataset.yaml | 43 ++ src/api/file_common_dataset.yaml | 22 +- ...file_train_h5ad.yaml => file_dataset.yaml} | 42 +- src/api/file_integrated.yaml | 53 +++ src/api/file_prediction.yaml | 26 -- src/api/file_score.yaml | 11 +- src/api/file_solution.yaml | 34 +- src/api/file_test_h5ad.yaml | 45 --- thumbnail.svg | 1 + 24 files changed, 662 insertions(+), 287 deletions(-) delete mode 100644 src/api/comp_data_processor.yaml create mode 100644 src/api/comp_process_dataset.yaml rename src/api/{file_train_h5ad.yaml => file_dataset.yaml} (51%) create mode 100644 src/api/file_integrated.yaml delete mode 100644 src/api/file_prediction.yaml delete mode 100644 src/api/file_test_h5ad.yaml create mode 100644 thumbnail.svg diff --git a/.vscode/settings.json b/.vscode/settings.json index 75292106..a3485e17 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -5,6 +5,9 @@ "common/schemas/task_config.yaml": "_viash.yaml", 
"common/schemas/task_method.yaml": "**/methods/**/config.vsh.yaml", "common/schemas/task_control_method.yaml": "**/control_methods/**/config.vsh.yaml", - "common/schemas/task_metric.yaml": "**/metrics/**/config.vsh.yaml" + "common/schemas/task_metric.yaml": "**/metrics/**/config.vsh.yaml", + "/home/rcannood/.viash/releases/0.9.0/schema.json": [ + "*.vsh.yaml" + ] } } diff --git a/CHANGELOG.md b/CHANGELOG.md index 38397448..2ba5cf94 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -# task_template x.y.z +# task_batch_integration x.y.z ## BREAKING CHANGES diff --git a/README.md b/README.md index da3ffe51..f2bb7859 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,367 @@ -# Task Template +# Batch Integration -This repo is a template to create a new task for the OpenProblems v2. This repo contains several example files and components that can be used when updated with the task info. -> [!WARNING] -> This README will be overwritten when performing the `create_task_readme` script. + -## Create a repository from this template +Remove unwanted batch effects from scRNA-seq data while retaining +biologically meaningful variation. -> [!IMPORTANT] -> Before creating a new repository, make sure you are part of the OpenProblems task team. This will be done when you create an issue for the task and you get the go ahead to create the task. -> For more information on how to create a new task, check out the [Create a new task](https://openproblems.bio/documentation/create_task/) documentation. +Repository: +[openproblems-bio/task_batch_integration](https://github.com/openproblems-bio/task_batch_integration) -The instructions below will guide you through creating a new repository from this template ([creating-a-repository-from-a-template](https://docs.github.com/en/repositories/creating-and-managing-repositories/creating-a-repository-from-a-template#creating-a-repository-from-a-template)). 
+## Description +As single-cell technologies advance, single-cell datasets are growing +both in size and complexity. Especially in consortia such as the Human +Cell Atlas, individual studies combine data from multiple labs, each +sequencing multiple individuals possibly with different technologies. +This gives rise to complex batch effects in the data that must be +computationally removed to perform a joint analysis. These batch +integration methods must remove the batch effect while not removing +relevant biological information. Currently, over 200 tools exist that +aim to remove batch effects scRNA-seq datasets \[@zappia2018exploring\]. +These methods balance the removal of batch effects with the conservation +of nuanced biological information in different ways. This abundance of +tools has complicated batch integration method choice, leading to +several benchmarks on this topic \[@luecken2020benchmarking; +@tran2020benchmark; @chazarragil2021flexible; @mereu2020benchmarking\]. +Yet, benchmarks use different metrics, method implementations and +datasets. Here we build a living benchmarking task for batch integration +methods with the vision of improving the consistency of method +evaluation. -* Click the "Use this template" button on the top right of the repository. -* Use the Owner dropdown menu to select the `openproblems-bio` account. -* Type a name for your repository (task_...), and a description. -* Set the repository visibility to public. -* Click "Create repository from template". +In this task we evaluate batch integration methods on their ability to +remove batch effects in the data while conserving variation attributed +to biological effects. As input, methods require either normalised or +unnormalised data with multiple batches and consistent cell type labels. +The batch integrated output can be a feature matrix, a low dimensional +embedding and/or a neighbourhood graph. 
The respective batch-integrated +representation is then evaluated using sets of metrics that capture how +well batch effects are removed and whether biological variance is +conserved. We have based this particular task on the latest, and most +extensive benchmark of single-cell data integration methods. -## Clone the repository +## Authors & contributors -To clone the repository with the submodule files, you can use the following command: +| name | roles | +|:------------------|:-------------------| +| Michaela Mueller | maintainer, author | +| Malte Luecken | author | +| Daniel Strobl | author | +| Robrecht Cannoodt | contributor | +| Scott Gigante | contributor | +| Kai Waldrant | contributor | +| Nartin Kim | contributor | -```bash -git clone --recursive git@github.com:openproblems-bio/.git +## API + +``` mermaid +flowchart LR + file_common_dataset("Common Dataset") + comp_process_dataset[/"Data processor"/] + file_dataset("Dataset") + file_solution("Solution") + comp_control_method[/"Control method"/] + comp_method[/"Method"/] + comp_metric[/"Metric"/] + file_integrated("Integrated Dataset") + file_score("Score") + file_common_dataset---comp_process_dataset + comp_process_dataset-->file_dataset + comp_process_dataset-->file_solution + file_dataset---comp_control_method + file_dataset---comp_method + file_solution---comp_control_method + file_solution---comp_metric + comp_control_method-->file_integrated + comp_method-->file_integrated + comp_metric-->file_score + file_integrated---comp_metric ``` ->[!NOTE] -> If somehow there are no files visible in the submodule after cloning using the above command. Check the instructions [here](common/README.md). -## What to do next +## File format: Common Dataset + +A subset of the common dataset. + +Example file: +`resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad` + +Format: + +
+ + AnnData object + obs: 'cell_type', 'batch' + var: 'hvg', 'hvg_score', 'feature_name' + obsm: 'X_pca' + obsp: 'knn_distances', 'knn_connectivities' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id', 'knn' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `obs["cell_type"]` | `string` | Cell type information. | +| `obs["batch"]` | `string` | Batch information. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | +| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["knn"]` | `object` | (*Optional*) Supplementary K nearest neighbors data. | + +
+ +## Component type: Data processor + +A label projection dataset processor. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input` | `file` | A subset of the common dataset. | +| `--output_dataset` | `file` | (*Output*) Unintegrated AnnData HDF5 file. | +| `--output_solution` | `file` | (*Output*) Uncensored dataset containing the true labels. | +| `--obs_label` | `string` | (*Optional*) NA. Default: `cell_type`. | +| `--obs_batch` | `string` | (*Optional*) NA. Default: `batch`. | +| `--hvgs` | `integer` | (*Optional*) NA. Default: `2000`. | +| `--subset_hvg` | `boolean` | (*Optional*) NA. Default: `FALSE`. | + +
+ +## File format: Dataset + +Unintegrated AnnData HDF5 file. + +Example file: +`resources_test/batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad` + +Format: + +
+ + AnnData object + obs: 'batch', 'label' + var: 'hvg', 'hvg_score', 'feature_name' + obsm: 'X_pca' + obsp: 'knn_distances', 'knn_connectivities' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'normalization_id', 'dataset_organism', 'knn' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `obs["batch"]` | `string` | Batch information. | +| `obs["label"]` | `string` | label information. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | +| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["knn"]` | `object` | Supplementary K nearest neighbors data. | + +
+ +## File format: Solution + +Uncensored dataset containing the true labels. + +Example file: +`resources_test/batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad` + +Format: + +
+ + AnnData object + obs: 'batch', 'label' + var: 'hvg', 'hvg_score', 'feature_name' + obsm: 'X_pca' + obsp: 'knn_distances', 'knn_connectivities' + layers: 'counts', 'normalized' + uns: 'dataset_id', 'dataset_name', 'dataset_url', 'dataset_reference', 'dataset_summary', 'dataset_description', 'dataset_organism', 'normalization_id', 'knn' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `obs["batch"]` | `string` | Batch information. | +| `obs["label"]` | `string` | label information. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `var["hvg_score"]` | `double` | A ranking of the features by hvg. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `obsm["X_pca"]` | `double` | The resulting PCA embedding. | +| `obsp["knn_distances"]` | `double` | K nearest neighbors distance matrix. | +| `obsp["knn_connectivities"]` | `double` | K nearest neighbors connectivities matrix. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["dataset_name"]` | `string` | Nicely formatted name. | +| `uns["dataset_url"]` | `string` | (*Optional*) Link to the original source of the dataset. | +| `uns["dataset_reference"]` | `string` | (*Optional*) Bibtex reference of the paper in which the dataset was published. | +| `uns["dataset_summary"]` | `string` | Short description of the dataset. | +| `uns["dataset_description"]` | `string` | Long description of the dataset. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["knn"]` | `object` | Supplementary K nearest neighbors data. | + +
+ +## Component type: Control method + +A control method for the batch integration task. + +Arguments: + +
+ +| Name | Type | Description | +|:-------------------|:-------|:-----------------------------------------------| +| `--input_dataset` | `file` | Unintegrated AnnData HDF5 file. | +| `--input_solution` | `file` | Uncensored dataset containing the true labels. | +| `--output` | `file` | (*Output*) An integrated AnnData dataset. | + +
+ +## Component type: Method + +A method for the batch integration task. + +Arguments: + +
+ +| Name | Type | Description | +|:-----------|:-------|:------------------------------------------| +| `--input` | `file` | Unintegrated AnnData HDF5 file. | +| `--output` | `file` | (*Output*) An integrated AnnData dataset. | + +
+ +## Component type: Metric + +A metric for evaluating batch integration methods. + +Arguments: + +
+ +| Name | Type | Description | +|:---------------------|:-------|:-----------------------------------------------| +| `--input_integrated` | `file` | An integrated AnnData dataset. | +| `--input_solution` | `file` | Uncensored dataset containing the true labels. | +| `--output` | `file` | (*Output*) Metric score file. | + +
+ +## File format: Integrated Dataset + +An integrated AnnData dataset. + +Example file: +`resources_test/batch_integration/cxg_mouse_pancreas_atlas/integrated.h5ad` + +Description: + +Must contain at least one of: + +- Feature: the corrected_counts layer +- Embedding: the X_emb obsm +- Graph: the connectivities and distances obsp + +Format: + +
+ + AnnData object + obsm: 'X_emb' + obsp: 'connectivities', 'distances' + layers: 'corrected_counts' + uns: 'dataset_id', 'normalization_id', 'dataset_organism', 'method_id', 'neighbors' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `obsm["X_emb"]` | `double` | (*Optional*) Embedding output - 2D coordinate matrix. | +| `obsp["connectivities"]` | `double` | (*Optional*) Graph output - neighbor connectivities matrix. | +| `obsp["distances"]` | `double` | (*Optional*) Graph output - neighbor distances matrix. | +| `layers["corrected_counts"]` | `double` | (*Optional*) Feature output - corrected counts. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | +| `uns["neighbors"]` | `object` | Supplementary K nearest neighbors data. | + +
+ +## File format: Score + +Metric score file + +Example file: `score.h5ad` + +Format: + +
+ + AnnData object + uns: 'dataset_id', 'normalization_id', 'method_id', 'metric_ids', 'metric_values' + +
+ +Data structure: + +
+ +| Slot | Type | Description | +|:---|:---|:---| +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["normalization_id"]` | `string` | Which normalization was used. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | +| `uns["metric_ids"]` | `string` | One or more unique metric identifiers. | +| `uns["metric_values"]` | `double` | The metric values obtained for the given prediction. Must be of same length as ‘metric_ids’. | -Check out the [instructions](https://github.com/openproblems-bio/common_resources/blob/main/INSTRUCTIONS.md) for more information on how to update the example files and components. These instructions also contain information on how to build out the task and basic commands. +
-For more information on the OpenProblems v2, check out the [documentation](https://openproblems.bio/documentation/). \ No newline at end of file diff --git a/_viash.yaml b/_viash.yaml index fe4c6257..8a0d18ea 100644 --- a/_viash.yaml +++ b/_viash.yaml @@ -1,81 +1,86 @@ viash_version: 0.9.0 -# Step 1: Change the name of the task. -# example: task_name_of_this_task -name: task_template +name: task_batch_integration organization: openproblems-bio version: dev license: MIT -# Step 2: Add keywords to describe the task. -keywords: [single-cell, openproblems, benchmark] -# Step 3: Update the `task_template` to the name of the task from step 1. +keywords: [ "batch integration", "scRNA-seq" ] links: - issue_tracker: https://github.com/openproblems-bio/task_template/issues - repository: https://github.com/openproblems-bio/task_template + issue_tracker: https://github.com/openproblems-bio/task_batch_integration/issues + repository: https://github.com/openproblems-bio/task_batch_integration docker_registry: ghcr.io - -# Step 4: Update the label, summary and description. -# A unique, human-readable, short label. Used for creating summary tables and visualisations. -label: Template -summary: A one sentence summary of purpose and methodology. Used for creating an overview tables. +label: Batch Integration +summary: Remove unwanted batch effects from scRNA-seq data while retaining biologically meaningful variation. description: | - Provide a clear and concise description of your task, detailing the specific problem it aims - to solve. Outline the input data types, the expected output, and any assumptions or constraints. - Be sure to explain any terminology or concepts that are essential for understanding the task. + As single-cell technologies advance, single-cell datasets are growing both in size and complexity. 
+ Especially in consortia such as the Human Cell Atlas, individual studies combine data from multiple labs, each sequencing multiple individuals possibly with different technologies. + This gives rise to complex batch effects in the data that must be computationally removed to perform a joint analysis. + These batch integration methods must remove the batch effect while not removing relevant biological information. + Currently, over 200 tools exist that aim to remove batch effects scRNA-seq datasets [@zappia2018exploring]. + These methods balance the removal of batch effects with the conservation of nuanced biological information in different ways. + This abundance of tools has complicated batch integration method choice, leading to several benchmarks on this topic [@luecken2020benchmarking; @tran2020benchmark; @chazarragil2021flexible; @mereu2020benchmarking]. + Yet, benchmarks use different metrics, method implementations and datasets. Here we build a living benchmarking task for batch integration methods with the vision of improving the consistency of method evaluation. - Explain the motivation behind your proposed task. Describe the biological or computational - problem you aim to address and why it's important. Discuss the current state of research in - this area and any gaps or challenges that your task could help address. This section - should convince readers of the significance and relevance of your task. + In this task we evaluate batch integration methods on their ability to remove batch effects in the data while conserving variation attributed to biological effects. + As input, methods require either normalised or unnormalised data with multiple batches and consistent cell type labels. + The batch integrated output can be a feature matrix, a low dimensional embedding and/or a neighbourhood graph. 
+ The respective batch-integrated representation is then evaluated using sets of metrics that capture how well batch effects are removed and whether biological variance is conserved. + We have based this particular task on the latest, and most extensive benchmark of single-cell data integration methods. -# A list of references to relevant literature. Each reference should be a DOI or a bibtex entry references: doi: - - 10.21203/rs.3.rs-4181617/v1 - # bibtex: - # - | - # @article{doe_2021_template, - # doi = {10.21203/rs.3.rs-4181617/v1}, - # url = {https://doi.org/10.21203/rs.3.rs-4181617/v1}, - # author = {Doe, John}, - # title = {A template for creating new tasks}, - # publisher = {Research Square}, - # year = {2021}, - # } + # Luecken, M.D., Büttner, M., Chaichoompu, K. et al. + # Benchmarking atlas-level data integration in single-cell genomics. Nat Methods 19, 41–50 (2022). + - 10.1038/s41592-021-01336-8 info: - image: The name of the image file to use for the component on the website. - # Step 5: Replace the task_template to the name of the task. + image: thumbnail.svg test_resources: - type: s3 - path: s3://openproblems-data/resources_test/task_template/ - dest: resources_test/task_template + path: s3://openproblems-data/resources_test/common/cxg_mouse_pancreas_atlas/ + dest: resources_test/common/cxg_mouse_pancreas_atlas - type: s3 - path: s3://openproblems-data/resources_test/common/ - dest: resources_test/common + path: s3://openproblems-data/resources_test/task_batch_integration/ + dest: resources_test/task_batch_integration -# Step 6: Update the authors of the task. authors: - # Full name of the author, usually in the name of FirstName MiddleName LastName. - - name: John Doe - # Role of the author. Possible values: - # - # * `"author"`: Authors who have made substantial contributions to the component. - # * `"maintainer"`: The maintainer of the component. - # * `"contributor"`: Authors who have made smaller contributions (such as code patches etc.). 
- roles: [ "author", "maintainer" ] - # Additional information on the author - info: - github: johndoe - orcid: 0000-0000-0000-0000 - email: john@doe.me - twitter: johndoe - linkedin: johndoe - -# Step 7: Remove all of the comments of the steps you completed -# Step 8: High five yourself! + - name: Michaela Mueller + roles: [ maintainer, author ] + info: + github: mumichae + orcid: 0000-0002-1401-1785 + - name: Malte Luecken + roles: [ author ] + info: + github: LuckyMD + orcid: 0000-0001-7464-7921 + - name: Daniel Strobl + roles: [ author ] + info: + github: danielStrobl + orcid: 0000-0002-5516-7057 + - name: Robrecht Cannoodt + roles: [ contributor ] + info: + github: rcannood + orcid: "0000-0003-3641-729X" + - name: "Scott Gigante" + roles: [ contributor ] + info: + github: scottgigante + orcid: "0000-0002-4544-2764" + - name: Kai Waldrant + roles: [ contributor ] + info: + github: KaiWaldrant + orcid: "0009-0003-8555-1361" + - name: Nartin Kim + roles: [ contributor ] + info: + github: martinkim0 + orcid: "0009-0003-8555-1361" config_mods: | .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" } diff --git a/common b/common index bf64ebca..f2642835 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit bf64ebcaef096f37013733351a08671f7caca896 +Subproject commit f2642835c89264e0a43e87e3f6c588c6be4902e7 diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh index ccfd5feb..7222cc85 100755 --- a/scripts/create_resources/resources.sh +++ b/scripts/create_resources/resources.sh @@ -8,7 +8,7 @@ cd "$REPO_ROOT" # remove this when you have implemented the script echo "TODO: once the 'process_datasets' workflow is implemented, update this script to use it." 
-echo " Step 1: replace 'task_template' with the name of the task in the following command." +echo " Step 1: replace 'task_batch_integration' with the name of the task in the following command." echo " Step 2: replace the rename keys parameters to fit your process_dataset inputs" echo " Step 3: replace the settings parameter to fit your process_dataset outputs" echo " Step 4: remove this message" @@ -19,10 +19,10 @@ input_states: s3://openproblems-data/resources/datasets/**/state.yaml rename_keys: 'input:output_dataset' output_state: '$id/state.yaml' settings: '{"output_train": "$id/train.h5ad", "output_test": "$id/test.h5ad"}' -publish_dir: s3://openproblems-data/resources/task_template/datasets/ +publish_dir: s3://openproblems-data/resources/task_batch_integration/datasets/ HERE -tw launch https://github.com/openproblems-bio/task_template.git \ +tw launch https://github.com/openproblems-bio/task_batch_integration.git \ --revision build/main \ --pull-latest \ --main-script target/nextflow/workflows/process_datasets/main.nf \ @@ -31,4 +31,4 @@ tw launch https://github.com/openproblems-bio/task_template.git \ --params-file /tmp/params.yaml \ --entry-name auto \ --config common/nextflow_helpers/labels_tw.config \ - --labels task_template,process_datasets + --labels task_batch_integration,process_datasets diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index 2b3378bb..06cc4486 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -14,7 +14,7 @@ exit 1 set -e RAW_DATA=resources_test/common -DATASET_DIR=resources_test/task_template +DATASET_DIR=resources_test/task_batch_integration mkdir -p $DATASET_DIR @@ -25,7 +25,7 @@ nextflow run . 
\ -profile docker \ --publish_dir "$DATASET_DIR" \ --id "pancreas" \ - --input "$RAW_DATA/pancreas/dataset.h5ad" \ + --input "$RAW_DATA/cxg_mouse_pancreas_atlas/dataset.h5ad" \ --output_train '$id/train.h5ad' \ --output_test '$id/test.h5ad' \ --output_solution '$id/solution.h5ad' \ @@ -33,17 +33,17 @@ nextflow run . \ # run one method viash run src/methods/knn/config.vsh.yaml -- \ - --input_train $DATASET_DIR/pancreas/train.h5ad \ - --input_test $DATASET_DIR/pancreas/test.h5ad \ - --output $DATASET_DIR/pancreas/prediction.h5ad + --input_train $DATASET_DIR/cxg_mouse_pancreas_atlas/train.h5ad \ + --input_test $DATASET_DIR/cxg_mouse_pancreas_atlas/test.h5ad \ + --output $DATASET_DIR/cxg_mouse_pancreas_atlas/prediction.h5ad # run one metric viash run src/metrics/accuracy/config.vsh.yaml -- \ - --input_prediction $DATASET_DIR/pancreas/prediction.h5ad \ - --input_solution $DATASET_DIR/pancreas/solution.h5ad \ - --output $DATASET_DIR/pancreas/score.h5ad + --input_prediction $DATASET_DIR/cxg_mouse_pancreas_atlas/prediction.h5ad \ + --input_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad \ + --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad # only run this if you have access to the openproblems-data bucket aws s3 sync --profile op \ - "$DATASET_DIR" s3://openproblems-data/resources_test/task_template \ + "$DATASET_DIR" s3://openproblems-data/resources_test/task_batch_integration \ --delete --dryrun diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh index 8c63393b..a814e273 100755 --- a/scripts/run_benchmark/run_full_local.sh +++ b/scripts/run_benchmark/run_full_local.sh @@ -13,7 +13,7 @@ cd "$REPO_ROOT" # remove this when you have implemented the script echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it." -echo " Step 1: replace 'task_template' with the name of the task in the following command." 
+echo " Step 1: replace 'task_batch_integration' with the name of the task in the following command." echo " Step 2: replace the rename keys parameters to fit your run_benchmark inputs" echo " Step 3: replace the settings parameter to fit your run_benchmark outputs" echo " Step 4: remove this message" @@ -37,7 +37,7 @@ publish_dir: "$publish_dir" HERE # run the benchmark -nextflow run openproblems-bio/task_template \ +nextflow run openproblems-bio/task_batch_integration \ --revision build/main \ -main-script target/nextflow/workflows/run_benchmark/main.nf \ -profile docker \ diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh index 87d133c4..30da5689 100755 --- a/scripts/run_benchmark/run_full_seqeracloud.sh +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -8,7 +8,7 @@ cd "$REPO_ROOT" # remove this when you have implemented the script echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it." -echo " Step 1: replace 'task_template' with the name of the task in the following command." +echo " Step 1: replace 'task_batch_integration' with the name of the task in the following command." 
echo " Step 2: replace the rename keys parameters to fit your run_benchmark inputs" echo " Step 3: replace the settings parameter to fit your run_benchmark outputs" echo " Step 4: remove this message" @@ -18,17 +18,17 @@ set -e # generate a unique id RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" -publish_dir="s3://openproblems-data/resources/task_template/results/${RUN_ID}" +publish_dir="s3://openproblems-data/resources/task_batch_integration/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE -input_states: s3://openproblems-data/resources/task_template/datasets/**/state.yaml +input_states: s3://openproblems-data/resources/task_batch_integration/datasets/**/state.yaml rename_keys: 'input_train:output_train;input_test:output_test;input_solution:output_solution' output_state: "state.yaml" publish_dir: "$publish_dir" HERE -tw launch https://github.com/openproblems-bio/task_template.git \ +tw launch https://github.com/openproblems-bio/task_batch_integration.git \ --revision build/main \ --pull-latest \ --main-script target/nextflow/workflows/run_benchmark/main.nf \ @@ -37,4 +37,4 @@ tw launch https://github.com/openproblems-bio/task_template.git \ --params-file /tmp/params.yaml \ --entry-name auto \ --config common/nextflow_helpers/labels_tw.config \ - --labels task_template,full \ No newline at end of file + --labels task_batch_integration,full \ No newline at end of file diff --git a/scripts/run_benchmark/run_test_local.sh b/scripts/run_benchmark/run_test_local.sh index e5496358..4325e95e 100755 --- a/scripts/run_benchmark/run_test_local.sh +++ b/scripts/run_benchmark/run_test_local.sh @@ -8,7 +8,7 @@ cd "$REPO_ROOT" # remove this when you have implemented the script echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it." -echo " Step 1: replace 'task_template' with the name of the task in the following command." 
+echo " Step 1: replace 'task_batch_integration' with the name of the task in the following command." echo " Step 2: replace the rename keys parameters to fit your run_benchmark inputs" echo " Step 3: replace the settings parameter to fit your run_benchmark outputs" echo " Step 4: remove this message" @@ -25,7 +25,7 @@ publish_dir="resources/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE -input_states: s3://openproblems-data/resources_test/task_template/**/state.yaml +input_states: s3://openproblems-data/resources_test/task_batch_integration/**/state.yaml rename_keys: 'input_train:output_train;input_test:output_test;input_solution:output_solution' output_state: "state.yaml" publish_dir: "$publish_dir" diff --git a/scripts/run_benchmark/run_test_seqeracloud.sh b/scripts/run_benchmark/run_test_seqeracloud.sh index ac910205..e8afb9e2 100755 --- a/scripts/run_benchmark/run_test_seqeracloud.sh +++ b/scripts/run_benchmark/run_test_seqeracloud.sh @@ -8,7 +8,7 @@ cd "$REPO_ROOT" # remove this when you have implemented the script echo "TODO: once the 'run_benchmark' workflow has been implemented, update this script to use it." -echo " Step 1: replace 'task_template' with the name of the task in the following command." +echo " Step 1: replace 'task_batch_integration' with the name of the task in the following command." 
echo " Step 2: replace the rename keys parameters to fit your run_benchmark inputs" echo " Step 3: replace the settings parameter to fit your run_benchmark outputs" echo " Step 4: remove this message" @@ -18,13 +18,13 @@ set -e # write the parameters to file cat > /tmp/params.yaml << 'HERE' -input_states: s3://openproblems-data/resources_test/task_template/**/state.yaml +input_states: s3://openproblems-data/resources_test/task_batch_integration/**/state.yaml rename_keys: 'input_train:output_train;input_test:output_test;input_solution:output_solution' output_state: "state.yaml" -publish_dir: s3://openproblems-nextflow/temp/task_template/ +publish_dir: s3://openproblems-nextflow/temp/task_batch_integration/ HERE -tw launch https://github.com/openproblems-bio/task_template.git \ +tw launch https://github.com/openproblems-bio/task_batch_integration.git \ --revision build/main \ --pull-latest \ --main-script target/nextflow/workflows/run_benchmark/main.nf \ @@ -33,4 +33,4 @@ tw launch https://github.com/openproblems-bio/task_template.git \ --params-file /tmp/params.yaml \ --entry-name auto \ --config common/nextflow_helpers/labels_tw.config \ - --labels task_template,test + --labels task_batch_integration,test diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index 4d767d8e..f3f44352 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -2,36 +2,27 @@ namespace: control_methods info: type: control_method type_info: - label: Control Method - summary: Quality control methods for verifying the pipeline. + label: Control method + summary: A control method for the batch integration task. description: | - This folder contains control components for the task. - These components have the same interface as the regular methods - but also receive the solution object as input. 
It serves as a - starting point to test the relative accuracy of new methods in - the task, and also as a quality control for the metrics defined - in the task. + A control method for the batch integration task. arguments: - - name: --input_train - __merge__: file_train_h5ad.yaml - required: true + - name: --input_dataset + __merge__: file_dataset.yaml direction: input - - name: --input_test - __merge__: file_test_h5ad.yaml required: true - direction: input - - name: "--input_solution" + - name: --input_solution __merge__: file_solution.yaml direction: input required: true - name: --output - __merge__: file_prediction.yaml - required: true direction: output + __merge__: file_integrated.yaml + required: true test_resources: - - type: python_script - path: /common/component_tests/run_and_check_output.py - type: python_script path: /common/component_tests/check_config.py - - path: /resources_test/task_template/pancreas - dest: resources_test/task_template/pancreas \ No newline at end of file + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/batch_integration/cxg_mouse_pancreas_atlas + dest: resources_test/batch_integration/cxg_mouse_pancreas_atlas diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml deleted file mode 100644 index 184bc548..00000000 --- a/src/api/comp_data_processor.yaml +++ /dev/null @@ -1,31 +0,0 @@ -namespace: "data_processors" -info: - type: data_processor - type_info: - label: Data processor - summary: A data processor. - description: | - A component for processing a Common Dataset into a task-specific dataset. 
-arguments: - - name: "--input" - __merge__: file_common_dataset.yaml - direction: input - required: true - - name: "--output_train" - __merge__: file_train_h5ad.yaml - direction: output - required: true - - name: "--output_test" - __merge__: file_test_h5ad.yaml - direction: output - required: true - - name: "--output_solution" - __merge__: file_solution.yaml - direction: output - required: true -test_resources: - - path: /resources_test/common/pancreas - dest: resources_test/common/pancreas - - type: python_script - path: /common/component_tests/run_and_check_output.py - diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index d7be9578..22972ce7 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -1,28 +1,24 @@ -namespace: "methods" +namespace: methods info: type: method type_info: label: Method - summary: A method. + summary: A method for the batch integration task. description: | - A method to predict the task effects. + A batch integration method which integrates multiple datasets. 
arguments: - - name: --input_train - __merge__: file_train_h5ad.yaml - required: true - direction: input - - name: "--input_test" - __merge__: file_test_h5ad.yaml + - name: --input + __merge__: file_dataset.yaml direction: input required: true - name: --output - __merge__: file_prediction.yaml - required: true + __merge__: file_integrated.yaml direction: output + required: true test_resources: - - type: python_script - path: /common/component_tests/run_and_check_output.py - type: python_script path: /common/component_tests/check_config.py - - path: /resources_test/task_template/pancreas - dest: resources_test/task_template/pancreas \ No newline at end of file + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/batch_integration/cxg_mouse_pancreas_atlas + dest: resources_test/batch_integration/cxg_mouse_pancreas_atlas diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index e3295da0..844e9c9c 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -1,28 +1,28 @@ -namespace: "metrics" +namespace: metrics info: type: metric type_info: label: Metric - summary: A task template metric. + summary: A metric for evaluating batch integration methods. description: | - A metric for evaluating method predictions. + A metric for evaluating batch integration methods. 
 arguments:
-  - name: "--input_solution"
-    __merge__: file_solution.yaml
+  - name: --input_integrated
+    __merge__: file_integrated.yaml
     direction: input
     required: true
-  - name: "--input_prediction"
-    __merge__: file_prediction.yaml
+  - name: --input_solution
+    __merge__: file_solution.yaml
     direction: input
     required: true
-  - name: "--output"
+  - name: --output
     __merge__: file_score.yaml
     direction: output
     required: true
 test_resources:
-  - type: python_script
-    path: /common/component_tests/run_and_check_output.py
   - type: python_script
     path: /common/component_tests/check_config.py
-  - path: /resources_test/task_template/pancreas
-    dest: resources_test/task_template/pancreas
+  - type: python_script
+    path: /common/component_tests/run_and_check_output.py
+  - path: /resources_test/batch_integration/cxg_mouse_pancreas_atlas
+    dest: resources_test/batch_integration/cxg_mouse_pancreas_atlas
diff --git a/src/api/comp_process_dataset.yaml b/src/api/comp_process_dataset.yaml
new file mode 100644
index 00000000..a6b33479
--- /dev/null
+++ b/src/api/comp_process_dataset.yaml
@@ -0,0 +1,43 @@
+info:
+  type: process_dataset
+  type_info:
+    label: Data processor
+    summary: A batch integration dataset processor.
+    description: |
+      A component for processing a Common Dataset into a task-specific dataset.
+arguments:
+  - name: "--input"
+    __merge__: file_common_dataset.yaml
+    direction: input
+    required: true
+  - name: "--output_dataset"
+    __merge__: file_dataset.yaml
+    direction: output
+    required: true
+  - name: "--output_solution"
+    __merge__: file_solution.yaml
+    direction: output
+    required: true
+  - name: "--obs_label"
+    type: "string"
+    description: "Which .obs slot to use as label."
+    default: "cell_type"
+  - name: "--obs_batch"
+    type: "string"
+    description: "Which .obs slot to use as batch covariate."
+ default: "batch" + - name: --hvgs + type: integer + description: Number of highly variable genes + default: 2000 + required: false + - name: --subset_hvg + type: boolean + description: Whether to subset to highly variable genes + default: false + required: false +test_resources: + - path: /resources_test/common/cxg_mouse_pancreas_atlas/ + dest: resources_test/common/cxg_mouse_pancreas_atlas/ + - type: python_script + path: /common/component_tests/run_and_check_output.py \ No newline at end of file diff --git a/src/api/file_common_dataset.yaml b/src/api/file_common_dataset.yaml index 0927ea0a..1399f0b2 100644 --- a/src/api/file_common_dataset.yaml +++ b/src/api/file_common_dataset.yaml @@ -1,5 +1,8 @@ +# This file is based on the spec of the common dataset located at +# `src/datasets/api/file_common_dataset.yaml`. However, some fields +# such as obs.cell_type and obs.batch are now required type: file -example: "resources_test/common/pancreas/dataset.h5ad" +example: "resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad" label: "Common Dataset" summary: A subset of the common dataset. info: @@ -32,11 +35,24 @@ info: name: hvg_score description: A ranking of the features by hvg. required: true + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. + required: true obsm: - type: double name: X_pca description: The resulting PCA embedding. required: true + obsp: + - type: double + name: knn_distances + description: K nearest neighbors distance matrix. + required: true + - type: double + name: knn_connectivities + description: K nearest neighbors connectivities matrix. + required: true uns: - type: string name: dataset_id @@ -70,3 +86,7 @@ info: name: normalization_id description: "Which normalization was used" required: true + - type: object + name: knn + description: Supplementary K nearest neighbors data. 
+ required: false diff --git a/src/api/file_train_h5ad.yaml b/src/api/file_dataset.yaml similarity index 51% rename from src/api/file_train_h5ad.yaml rename to src/api/file_dataset.yaml index 7d2b51d5..f838b2ef 100644 --- a/src/api/file_train_h5ad.yaml +++ b/src/api/file_dataset.yaml @@ -1,29 +1,28 @@ -#TODO: Change to the required and/or optional fields of the anndata type: file -example: "resources_test/task_template/pancreas/train.h5ad" -label: "Training data" -summary: "The training data in h5ad format" +example: "resources_test/batch_integration/cxg_mouse_pancreas_atlas/dataset.h5ad" +label: "Dataset" +summary: Unintegrated AnnData HDF5 file. info: format: type: h5ad - layers: + layers: - type: integer name: counts description: Raw counts required: true - type: double name: normalized - description: Normalized counts + description: Normalized expression values required: true obs: - - type: string - name: label - description: Ground truth cell type labels - required: true - type: string name: batch description: Batch information required: true + - type: string + name: label + description: label information + required: true var: - type: boolean name: hvg @@ -33,11 +32,24 @@ info: name: hvg_score description: A ranking of the features by hvg. required: true + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. + required: true obsm: - type: double name: X_pca description: The resulting PCA embedding. required: true + obsp: + - type: double + name: knn_distances + description: K nearest neighbors distance matrix. + required: true + - type: double + name: knn_connectivities + description: K nearest neighbors connectivities matrix. 
+ required: true uns: - type: string name: dataset_id @@ -46,4 +58,12 @@ info: - type: string name: normalization_id description: "Which normalization was used" - required: true \ No newline at end of file + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: object + name: knn + description: Supplementary K nearest neighbors data. + required: true diff --git a/src/api/file_integrated.yaml b/src/api/file_integrated.yaml new file mode 100644 index 00000000..1b6d6873 --- /dev/null +++ b/src/api/file_integrated.yaml @@ -0,0 +1,53 @@ +type: file +example: "resources_test/batch_integration/cxg_mouse_pancreas_atlas/integrated.h5ad" +label: "Integrated Dataset" +summary: An integrated AnnData dataset. +description: | + Must contain at least one of: + + - Feature: the corrected_counts layer + - Embedding: the X_emb obsm + - Graph: the connectivities and distances obsp +info: + format: + type: h5ad + layers: + - type: double + name: corrected_counts + description: Feature output - corrected counts + required: false + obsm: + - type: double + name: X_emb + description: Embedding output - 2D coordinate matrix + required: false + obsp: + - type: double + name: connectivities + description: "Graph output - neighbor connectivities matrix" + required: false + - type: double + name: distances + description: "Graph output - neighbor distances matrix" + required: false + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: normalization_id + description: "Which normalization was used" + required: true + - name: dataset_organism + type: string + description: The organism of the sample in the dataset. + required: false + - type: string + name: method_id + description: "A unique identifier for the method" + required: true + - type: object + name: neighbors + description: Supplementary K nearest neighbors data. 
+ required: true diff --git a/src/api/file_prediction.yaml b/src/api/file_prediction.yaml deleted file mode 100644 index 4a6dc328..00000000 --- a/src/api/file_prediction.yaml +++ /dev/null @@ -1,26 +0,0 @@ -#TODO: Change to the required and/or optional fields of the anndata -type: file -example: "resources_test/task_template/pancreas/prediction.h5ad" -label: "Predicted data" -summary: A predicted dataset as output by a method. -info: - format: - type: h5ad - obs: - - type: string - name: label_pred - description: Predicted labels for the test cells. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true - - type: string - name: method_id - description: "A unique identifier for the method" - required: true \ No newline at end of file diff --git a/src/api/file_score.yaml b/src/api/file_score.yaml index f6022a83..9c0ebf0e 100644 --- a/src/api/file_score.yaml +++ b/src/api/file_score.yaml @@ -1,8 +1,7 @@ -#TODO: Change to the required and/or optional fields of the anndata type: file -example: resources/score.h5ad -label: Score -summary: "File indicating the score of a metric." +example: "score.h5ad" +label: "Score" +summary: "Metric score file" info: format: type: h5ad @@ -23,9 +22,9 @@ info: name: metric_ids description: "One or more unique metric identifiers" multiple: true - required: true + required: true - type: double name: metric_values description: "The metric values obtained for the given prediction. Must be of same length as 'metric_ids'." 
multiple: true - required: true \ No newline at end of file + required: true diff --git a/src/api/file_solution.yaml b/src/api/file_solution.yaml index 81e168e9..a15b902d 100644 --- a/src/api/file_solution.yaml +++ b/src/api/file_solution.yaml @@ -1,29 +1,28 @@ -#TODO: Change to the required and/or optional fields of the anndata type: file -example: "resources_test/task_template/pancreas/solution.h5ad" +example: "resources_test/batch_integration/cxg_mouse_pancreas_atlas/solution.h5ad" label: "Solution" -summary: "The solution for the test data" +summary: Uncensored dataset containing the true labels. info: format: type: h5ad - layers: + layers: - type: integer name: counts description: Raw counts required: true - type: double name: normalized - description: Normalized counts + description: Normalized expression values required: true obs: - - type: string - name: label - description: Ground truth cell type labels - required: true - type: string name: batch description: Batch information required: true + - type: string + name: label + description: label information + required: true var: - type: boolean name: hvg @@ -33,11 +32,24 @@ info: name: hvg_score description: A ranking of the features by hvg. required: true + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. + required: true obsm: - type: double name: X_pca description: The resulting PCA embedding. required: true + obsp: + - type: double + name: knn_distances + description: K nearest neighbors distance matrix. + required: true + - type: double + name: knn_connectivities + description: K nearest neighbors connectivities matrix. + required: true uns: - type: string name: dataset_id @@ -71,3 +83,7 @@ info: name: normalization_id description: "Which normalization was used" required: true + - type: object + name: knn + description: Supplementary K nearest neighbors data. 
+ required: true diff --git a/src/api/file_test_h5ad.yaml b/src/api/file_test_h5ad.yaml deleted file mode 100644 index 6ee21ac5..00000000 --- a/src/api/file_test_h5ad.yaml +++ /dev/null @@ -1,45 +0,0 @@ -#TODO: Change to the required and/or optional fields of the anndata -type: file -example: "resources_test/task_template/pancreas/test.h5ad" -label: "Test data" -summary: The subset of molecules used for the test dataset -info: - format: - type: h5ad - layers: - - type: integer - name: counts - description: Raw counts - required: true - - type: double - name: normalized - description: Normalized counts - required: true - obs: - - type: string - name: batch - description: Batch information - required: true - var: - - type: boolean - name: hvg - description: Whether or not the feature is considered to be a 'highly variable gene' - required: true - - type: double - name: hvg_score - description: A ranking of the features by hvg. - required: true - obsm: - - type: double - name: X_pca - description: The resulting PCA embedding. - required: true - uns: - - type: string - name: dataset_id - description: "A unique identifier for the dataset" - required: true - - type: string - name: normalization_id - description: "Which normalization was used" - required: true \ No newline at end of file diff --git a/thumbnail.svg b/thumbnail.svg new file mode 100644 index 00000000..77626c5b --- /dev/null +++ b/thumbnail.svg @@ -0,0 +1 @@ +Batch 1Batch 2dim-2dim-1dim-2dim-1 \ No newline at end of file