update api

openproblems-bio · Sep 21, 2024 · 2475ed7 · 2475ed7
1 parent 1262c52
commit 2475ed7
Show file tree

Hide file tree

Showing 6 changed files with 165 additions and 16 deletions.
diff --git a/README.md b/README.md
@@ -67,8 +67,10 @@ flowchart LR
   comp_control_method[/"Control method"/]
   comp_method[/"Method"/]
   comp_metric[/"Metric"/]
-  file_integrated("Integrated Dataset")
+  file_integrated("Integration")
   file_score("Score")
+  comp_transformer[/"Transform"/]
+  file_integrated_full("Transformed integration")
   file_common_dataset---comp_process_dataset
   comp_process_dataset-->file_dataset
   comp_process_dataset-->file_solution
@@ -79,7 +81,9 @@ flowchart LR
   comp_control_method-->file_integrated
   comp_method-->file_integrated
   comp_metric-->file_score
-  file_integrated---comp_metric
+  file_integrated---comp_transformer
+  comp_transformer-->file_integrated_full
+  file_integrated_full---comp_metric
 ```
 
 ## File format: Common Dataset
@@ -283,15 +287,15 @@ Arguments:
 
 <div class="small">
 
-| Name                 | Type   | Description                                    |
-|:---------------------|:-------|:-----------------------------------------------|
-| `--input_integrated` | `file` | An integrated AnnData dataset.                 |
-| `--input_solution`   | `file` | Uncensored dataset containing the true labels. |
-| `--output`           | `file` | (*Output*) Metric score file.                  |
+| Name | Type | Description |
+|:---|:---|:---|
+| `--input_integrated` | `file` | An integrated AnnData dataset with additional outputs. |
+| `--input_solution` | `file` | Uncensored dataset containing the true labels. |
+| `--output` | `file` | (*Output*) Metric score file. |
 
 </div>
 
-## File format: Integrated Dataset
+## File format: Integration
 
 An integrated AnnData dataset.
 
@@ -365,3 +369,66 @@ Data structure:
 
 </div>
 
+## Component type: Transform
+
+Transform batch integration outputs where necessary
+
+Arguments:
+
+<div class="small">
+
+| Name | Type | Description |
+|:---|:---|:---|
+| `--input` | `file` | An integrated AnnData dataset. |
+| `--output` | `file` | (*Output*) An integrated AnnData dataset with additional outputs. |
+
+</div>
+
+## File format: Transformed integration
+
+An integrated AnnData dataset with additional outputs.
+
+Example file:
+`resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad`
+
+Description:
+
+Contains the following integration outputs:
+
+- Feature: the corrected_counts layer
+- Embedding: the X_emb obsm
+- Graph: the connectivities and distances obsp
+
+The Graph must always be present; the Feature and Embedding are
+optional.
+
+Format:
+
+<div class="small">
+
+    AnnData object
+     obsm: 'X_emb'
+     obsp: 'connectivities', 'distances'
+     layers: 'corrected_counts'
+     uns: 'dataset_id', 'normalization_id', 'dataset_organism', 'method_id', 'neighbors'
+
+</div>
+
+Data structure:
+
+<div class="small">
+
+| Slot | Type | Description |
+|:---|:---|:---|
+| `obsm["X_emb"]` | `double` | (*Optional*) Embedding output - 2D coordinate matrix. |
+| `obsp["connectivities"]` | `double` | Graph output - neighbor connectivities matrix. |
+| `obsp["distances"]` | `double` | Graph output - neighbor distances matrix. |
+| `layers["corrected_counts"]` | `double` | (*Optional*) Feature output - corrected counts. |
+| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. |
+| `uns["normalization_id"]` | `string` | Which normalization was used. |
+| `uns["dataset_organism"]` | `string` | (*Optional*) The organism of the sample in the dataset. |
+| `uns["method_id"]` | `string` | A unique identifier for the method. |
+| `uns["neighbors"]` | `object` | Supplementary K nearest neighbors data. |
+
+</div>
+
diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh
@@ -20,18 +20,23 @@ viash run src/process_dataset/config.vsh.yaml -- \
   --output_solution "$DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad"
 
 # run one method
-viash run src/methods/knn/config.vsh.yaml -- \
-    --input_train $DATASET_DIR/cxg_mouse_pancreas_atlas/train.h5ad \
-    --input_test $DATASET_DIR/cxg_mouse_pancreas_atlas/test.h5ad \
-    --output $DATASET_DIR/cxg_mouse_pancreas_atlas/prediction.h5ad
+viash run src/methods/combat/config.vsh.yaml -- \
+  --input $DATASET_DIR/cxg_mouse_pancreas_atlas/dataset.h5ad \
+  --output $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated.h5ad
+
+# run transformer
+viash run src/transformers/transform/config.vsh.yaml -- \
+    --input $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated.h5ad \
+    --output $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated_full.h5ad
 
 # run one metric
 viash run src/metrics/accuracy/config.vsh.yaml -- \
-    --input_prediction $DATASET_DIR/cxg_mouse_pancreas_atlas/prediction.h5ad \
+    --input_integrated $DATASET_DIR/cxg_mouse_pancreas_atlas/integrated_full.h5ad \
     --input_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad \
     --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad
 
 # only run this if you have access to the openproblems-data bucket
 aws s3 sync --profile op \
-  "$DATASET_DIR" s3://openproblems-data/resources_test/task_batch_integration \
+  "resources_test/task_batch_integration" \
+  s3://openproblems-data/resources_test/task_batch_integration \
   --delete --dryrun
diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml
@@ -8,7 +8,7 @@ info:
       A metric for evaluating batch integration methods.
 arguments:
   - name: --input_integrated
-    __merge__: file_integrated.yaml
+    __merge__: file_integrated_full.yaml
     direction: input
     required: true
   - name: --input_solution

diff --git a/src/api/comp_transformer.yaml b/src/api/comp_transformer.yaml
@@ -0,0 +1,22 @@
+namespace: transformers  # API spec for "Transform" components: Integration in, Transformed integration out
+info:
+  type: transformer
+  type_info:
+    label: Transform
+    summary: Transform batch integration outputs where necessary
+    description: |
+      Transform corrected feature output to an embedding, and an embedding to a graph output.
+arguments:
+  - name: --input
+    __merge__: file_integrated.yaml  # input follows the "Integration" file format (method / control method output)
+    direction: input
+    required: true
+  - name: --output
+    __merge__: file_integrated_full.yaml  # output follows the "Transformed integration" file format read by metrics
+    direction: output
+    required: true
+test_resources:
+  - type: python_script
+    path: /common/component_tests/run_and_check_output.py  # shared test script; presumably runs the component and checks its output - confirm
+  - path: /resources_test/task_batch_integration/cxg_mouse_pancreas_atlas
+    dest: resources_test/task_batch_integration/cxg_mouse_pancreas_atlas  # same dataset directory the file specs' example paths point into
diff --git a/src/api/file_integrated.yaml b/src/api/file_integrated.yaml
@@ -1,6 +1,6 @@
 type: file
 example: "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated.h5ad"
-label: "Integrated Dataset"
+label: Integration
 summary: An integrated AnnData dataset.
 description: |
   Must contain at least one of:

diff --git a/src/api/file_integrated_full.yaml b/src/api/file_integrated_full.yaml
@@ -0,0 +1,55 @@
+type: file  # file-format spec for the transformer output that metrics take as --input_integrated
+example: "resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad"
+label: Transformed integration
+summary: An integrated AnnData dataset with additional outputs.
+description: |
+  Contains the following integration outputs:
+
+    - Feature: the corrected_counts layer
+    - Embedding: the X_emb obsm
+    - Graph: the connectivities and distances obsp
+
+  The Graph must always be present; the Feature and Embedding are optional.
+info:
+  format:
+    type: h5ad
+    layers:
+      - type: double
+        name: corrected_counts
+        description: Feature output - corrected counts
+        required: false  # optional: Feature output
+    obsm:
+      - type: double
+        name: X_emb
+        description: Embedding output - 2D coordinate matrix
+        required: false  # optional: Embedding output
+    obsp:
+      - type: double
+        name: connectivities
+        description: "Graph output - neighbor connectivities matrix"
+        required: true  # Graph output is mandatory in this format
+      - type: double
+        name: distances
+        description: "Graph output - neighbor distances matrix"
+        required: true  # Graph output is mandatory in this format
+    uns:
+      - type: string
+        name: dataset_id
+        description: "A unique identifier for the dataset"
+        required: true
+      - type: string
+        name: normalization_id
+        description: "Which normalization was used"
+        required: true
+      - type: string
+        name: dataset_organism
+        description: The organism of the sample in the dataset.
+        required: false
+      - type: string
+        name: method_id
+        description: "A unique identifier for the method"
+        required: true
+      - type: object
+        name: neighbors
+        description: Supplementary K nearest neighbors data.
+        required: true  # accompanies the mandatory Graph output