diff --git a/_viash.yaml b/_viash.yaml
index 83d3568..ed50565 100644
--- a/_viash.yaml
+++ b/_viash.yaml
@@ -2,7 +2,7 @@ viash_version: 0.9.0
 
 # Step 1: Change the name of the task.
 # example: task_name_of_this_task
-name: task_template
+name: task_foundation_models
 organization: openproblems-bio
 version: dev
 
@@ -11,8 +11,8 @@ license: MIT
 keywords: [single-cell, openproblems, benchmark]
 # Step 3: Update the `task_template` to the name of the task from step 1.
 links:
-  issue_tracker: https://github.com/openproblems-bio/task_template/issues
-  repository: https://github.com/openproblems-bio/task_template
+  issue_tracker: https://github.com/openproblems-bio/task_foundation_models/issues
+  repository: https://github.com/openproblems-bio/task_foundation_models
   docker_registry: ghcr.io
 
 
@@ -50,11 +50,8 @@ info:
   # Step 5: Replace the task_template to the name of the task.
   test_resources:
     - type: s3
-      path: s3://openproblems-data/resources_test/common/
-      dest: resources_test/common
-    - type: s3
-      path: s3://openproblems-data/resources_test/task_template/
-      dest: resources_test/task_template
+      path: s3://openproblems-data/resources_test/task_foundation_models/
+      dest: resources_test/task_foundation_models
 
 # Step 6: Update the authors of the task.
 authors:
@@ -81,8 +78,7 @@ config_mods: |
   .runners[.type == "nextflow"].config.labels := { lowmem : "memory = 20.Gb", midmem : "memory = 50.Gb", highmem : "memory = 100.Gb", lowcpu : "cpus = 5", midcpu : "cpus = 15", highcpu : "cpus = 30", lowtime : "time = 1.h", midtime : "time = 4.h", hightime : "time = 8.h", veryhightime : "time = 24.h" }
 
 repositories:
-  - name: core
+  - name: openproblems
     type: github
-    repo: openproblems-bio/core
+    repo: openproblems-bio/openproblems
     tag: build/main
-    path: viash/core
diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh
index 9cb372a..5eeb639 100755
--- a/scripts/create_resources/test_resources.sh
+++ b/scripts/create_resources/test_resources.sh
@@ -6,48 +6,51 @@ REPO_ROOT=$(git rev-parse --show-toplevel)
 # ensure that the command below is run from the root of the repository
 cd "$REPO_ROOT"
 
-# # remove this when you have implemented the script
-# echo "TODO: replace the commands in this script with the sequence of components that you need to run to generate test_resources."
-# echo "      Inside this script, you will need to place commands to generate example files for each of the 'src/api/file_*.yaml' files."
-# exit 1
-
 set -e
 
-RAW_DATA=resources_test/common
-DATASET_DIR=resources_test/task_template
-
-mkdir -p $DATASET_DIR
-
-# process dataset
-viash run src/data_processors/process_dataset/config.vsh.yaml -- \
-  --input $RAW_DATA/cxg_mouse_pancreas_atlas/dataset.h5ad \
-  --output_train $DATASET_DIR/cxg_mouse_pancreas_atlas/train.h5ad \
-  --output_test $DATASET_DIR/cxg_mouse_pancreas_atlas/test.h5ad \
-  --output_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad
-
-# run one method
-viash run src/methods/logistic_regression/config.vsh.yaml -- \
-  --input_train $DATASET_DIR/cxg_mouse_pancreas_atlas/train.h5ad \
-  --input_test $DATASET_DIR/cxg_mouse_pancreas_atlas/test.h5ad \
-  --output $DATASET_DIR/cxg_mouse_pancreas_atlas/prediction.h5ad
-
-# run one metric
-viash run src/metrics/accuracy/config.vsh.yaml -- \
-  --input_prediction $DATASET_DIR/cxg_mouse_pancreas_atlas/prediction.h5ad \
-  --input_solution $DATASET_DIR/cxg_mouse_pancreas_atlas/solution.h5ad \
-  --output $DATASET_DIR/cxg_mouse_pancreas_atlas/score.h5ad
-
-# write manual state.yaml. this is not actually necessary but you never know it might be useful
-cat > $DATASET_DIR/cxg_mouse_pancreas_atlas/state.yaml << HERE
-id: cxg_mouse_pancreas_atlas
-train: !file train.h5ad
-test: !file test.h5ad
-solution: !file solution.h5ad
-prediction: !file prediction.h5ad
-score: !file score.h5ad
-HERE
+OUT_DIR=resources_test/task_foundation_models/results
+
+TASKS=(
+  "task_label_projection"
+  "task_batch_integration"
+)
+
+if [ -d "$OUT_DIR" ]; then
+  echo "Removing existing directory '$OUT_DIR'"
+  rm -rf "$OUT_DIR"
+fi
+
+mkdir -p "$OUT_DIR"
+
+for TASK in "${TASKS[@]}"; do
+  BASE_DIR="s3://openproblems-data/resources/$TASK/results"
+
+  # Find the most recent 'run_<date>' subdir in the bucket that contains a
+  # 'task_info.yaml' file. `sed -n ...p` only emits keys that match that
+  # exact layout, so unrelated keys can never end up in DATE.
+  DATE=$(aws s3 ls "$BASE_DIR/" --recursive --no-sign-request \
+    | awk '{print $4}' \
+    | sed -n 's#^.*/run_\([^/]*\)/task_info\.yaml$#\1#p' \
+    | sort \
+    | tail -n 1)
+
+  # Abort instead of syncing from the meaningless prefix '.../run_' when the
+  # listing failed or no completed run exists.
+  if [ -z "$DATE" ]; then
+    echo "Error: no run with a 'task_info.yaml' found under '$BASE_DIR/'" >&2
+    exit 1
+  fi
+
+  INPUT_DIR="$BASE_DIR/run_$DATE"
+  TASK_STRIP_PREFIX="${TASK#task_}"
+  OUTPUT_DIR="$OUT_DIR/$TASK_STRIP_PREFIX"
+
+  echo "Syncing '$INPUT_DIR' to '$OUTPUT_DIR'"
+  aws s3 sync "$INPUT_DIR" "$OUTPUT_DIR" --delete --no-sign-request
+done
 
 # only run this if you have access to the openproblems-data bucket
 aws s3 sync --profile op \
-  "$DATASET_DIR" s3://openproblems-data/resources_test/task_template \
+  "resources_test/task_foundation_models" \
+  s3://openproblems-data/resources_test/task_foundation_models \
   --delete --dryrun
diff --git a/src/control_methods/true_labels/config.vsh.yaml b/src/control_methods/true_labels/config.vsh.yaml
deleted file mode 100644
index 741e3f2..0000000
--- a/src/control_methods/true_labels/config.vsh.yaml
+++ /dev/null
@@ -1,59 +0,0 @@
-# The API specifies which type of component this is.
-# It contains specifications for:
-#   - The input/output files
-#   - Common parameters
-#   - A unit test
-__merge__: ../../api/comp_control_method.yaml
-
-# A unique identifier for your component (required).
-# Can contain only lowercase letters or underscores.
-name: true_labels - -# A relatively short label, used when rendering visualisations (required) -label: True Labels -# A one sentence summary of how this method works (required). Used when -# rendering summary tables. -summary: "a positive control, solution labels are copied 1 to 1 to the predicted data." -# A multi-line description of how this component works (required). Used -# when rendering reference documentation. -description: | - A positive control, where the solution labels are copied 1 to 1 to the predicted data. - -# Metadata for your component -info: - # Which normalisation method this component prefers to use (required). - preferred_normalization: counts - -# Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. - -# Resources required to run the component -resources: - # The script of your component (required) - - type: python_script - path: script.py - # Additional resources your script needs (optional) - # - type: file - # path: weights.pt - -engines: - # Specifications for the Docker image for this component. - - type: docker - image: openproblems/base_python:1.0.0 - # Add custom dependencies here (optional). For more information, see - # https://viash.io/reference/config/engines/docker/#setup . - # setup: - # - type: python - # packages: scib==1.1.5 - -runners: - # This platform allows running the component natively - - type: executable - # Allows turning the component into a Nextflow module / pipeline. - - type: nextflow - directives: - label: [midtime, lowmem, lowcpu] diff --git a/src/control_methods/true_labels/script.py b/src/control_methods/true_labels/script.py deleted file mode 100644 index 935f3af..0000000 --- a/src/control_methods/true_labels/script.py +++ /dev/null @@ -1,45 +0,0 @@ -import anndata as ad - -## VIASH START -# Note: this section is auto-generated by viash at runtime. 
To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. -par = { - 'input_train': 'resources_test/task_template/cxg_mouse_pancreas_atlas/train.h5ad', - 'input_test': 'resources_test/task_template/cxg_mouse_pancreas_atlas/test.h5ad', - 'input_solution': 'resources_test/task_template/cxg_mouse_pancreas_atlas/solution.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'name': 'true_labels' -} -## VIASH END - -print('Reading input files', flush=True) -input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) -input_solution = ad.read_h5ad(par['input_solution']) - -print('Preprocess data', flush=True) -# ... preprocessing ... - -print('Train model', flush=True) -# ... train model ... - -print('Generate predictions', flush=True) -# ... generate predictions ... -obs_label_pred = input_solution.obs["label"] - -print("Write output AnnData to file", flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': input_train.uns['dataset_id'], - 'normalization_id': input_train.uns['normalization_id'], - 'method_id': meta['name'] - }, - obs={ - 'label_pred': obs_label_pred - } -) -output.obs_names = input_test.obs_names - -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml deleted file mode 100644 index a997720..0000000 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ /dev/null @@ -1,34 +0,0 @@ -__merge__: ../../api/comp_data_processor.yaml -name: process_dataset -arguments: - - name: "--method" - type: "string" - description: "The process method to assign train/test." - choices: ["batch", "random"] - default: "batch" - - name: "--obs_label" - type: "string" - description: "Which .obs slot to use as label." - default: "cell_type" - - name: "--obs_batch" - type: "string" - description: "Which .obs slot to use as batch covariate." 
- default: "batch" - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 -resources: - - type: python_script - path: script.py - - path: /common/helper_functions/subset_h5ad_by_format.py - -engines: - - type: docker - image: openproblems/base_python:1.0.0 - -runners: - - type: executable - - type: nextflow - directives: - label: [highmem, midcpu, midtime] \ No newline at end of file diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py deleted file mode 100644 index 3eb56c2..0000000 --- a/src/data_processors/process_dataset/script.py +++ /dev/null @@ -1,86 +0,0 @@ -import sys -import random -import numpy as np -import anndata as ad -import openproblems as op - -## VIASH START -par = { - 'input': 'resources_test/common/cxg_mouse_pancreas_atlas/dataset.h5ad', - 'method': 'batch', - 'seed': None, - 'obs_batch': 'batch', - 'obs_label': 'cell_type', - 'output_train': 'train.h5ad', - 'output_test': 'test.h5ad', - 'output_solution': 'solution.h5ad' -} -meta = { - 'resources_dir': 'target/executable/data_processors/process_dataset', - 'config': 'target/executable/data_processors/process_dataset/.config.vsh.yaml' -} -## VIASH END - -# import helper functions -sys.path.append(meta['resources_dir']) -from subset_h5ad_by_format import subset_h5ad_by_format - -config = op.project.read_viash_config(meta["config"]) - -# set seed if need be -if par["seed"]: - print(f">> Setting seed to {par['seed']}") - random.seed(par["seed"]) - -print(">> Load data", flush=True) -adata = ad.read_h5ad(par["input"]) -print("input:", adata) - -print(f">> Process data using {par['method']} method") -if par["method"] == "batch": - batch_info = adata.obs[par["obs_batch"]] - batch_categories = batch_info.dtype.categories - test_batches = random.sample(list(batch_categories), 1) - is_test = [ x in test_batches for x in batch_info ] -elif par["method"] == "random": - train_ix = np.random.choice(adata.n_obs, 
round(adata.n_obs * 0.8), replace=False) - is_test = [ not x in train_ix for x in range(0, adata.n_obs) ] - -# subset the different adatas -print(">> Figuring which data needs to be copied to which output file", flush=True) -# use par arguments to look for label and batch value in different slots -slot_mapping = { - "obs": { - "label": par["obs_label"], - "batch": par["obs_batch"], - } -} - -print(">> Creating train data", flush=True) -output_train = subset_h5ad_by_format( - adata[[not x for x in is_test]], - config, - "output_train", - slot_mapping -) - -print(">> Creating test data", flush=True) -output_test = subset_h5ad_by_format( - adata[is_test], - config, - "output_test", - slot_mapping -) - -print(">> Creating solution data", flush=True) -output_solution = subset_h5ad_by_format( - adata[is_test], - config, - "output_solution", - slot_mapping -) - -print(">> Writing data", flush=True) -output_train.write_h5ad(par["output_train"]) -output_test.write_h5ad(par["output_test"]) -output_solution.write_h5ad(par["output_solution"]) diff --git a/src/methods/logistic_regression/config.vsh.yaml b/src/methods/logistic_regression/config.vsh.yaml deleted file mode 100644 index 479aa3a..0000000 --- a/src/methods/logistic_regression/config.vsh.yaml +++ /dev/null @@ -1,79 +0,0 @@ -# The API specifies which type of component this is. -# It contains specifications for: -# - The input/output files -# - Common parameters -# - A unit test -__merge__: ../../api/comp_method.yaml - - -# A unique identifier for your component (required). -# Can contain only lowercase letters or underscores. -name: logistic_regression -# A relatively short label, used when rendering visualisations (required) -label: Logistic Regression -# A one sentence summary of how this method works (required). Used when -# rendering summary tables. 
-summary: "Logistic Regression with 100-dimensional PCA coordinates estimates parameters for multivariate classification by minimizing cross entropy loss over cell type classes." -# A multi-line description of how this component works (required). Used -# when rendering reference documentation. -description: | - Logistic Regression estimates parameters of a logistic function for - multivariate classification tasks. Here, we use 100-dimensional whitened PCA - coordinates as independent variables, and the model minimises the cross - entropy loss over all cell type classes. -# Metadata for your component -# A reference key from the bibtex library at src/common/library.bib (required). -references: - bibtex: - - | - @book{hosmer2013applied, - title = {Applied logistic regression}, - author = {Hosmer Jr, D.W. and Lemeshow, S. and Sturdivant, R.X.}, - year = {2013}, - publisher = {John Wiley \& Sons}, - volume = {398} - } - -links: - # URL to the code repository for this method (required). - repository: https://github.com/scikit-learn/scikit-learn - # URL to the documentation for this method (required). - documentation: "https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" - -info: - # Which normalisation method this component prefers to use (required). - preferred_normalization: log_cp10k - -# Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. - -# Resources required to run the component -resources: - # The script of your component (required) - - type: python_script - path: script.py - # Additional resources your script needs (optional) - # - type: file - # path: weights.pt - -engines: - # Specifications for the Docker image for this component. - - type: docker - image: openproblems/base_python:1.0.0 - # Add custom dependencies here (optional). 
For more information, see - # https://viash.io/reference/config/engines/docker/#setup . - setup: - - type: python - packages: scikit-learn - -runners: - # This platform allows running the component natively - - type: executable - # Allows turning the component into a Nextflow module / pipeline. - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/methods/logistic_regression/script.py b/src/methods/logistic_regression/script.py deleted file mode 100644 index 6ab5782..0000000 --- a/src/methods/logistic_regression/script.py +++ /dev/null @@ -1,46 +0,0 @@ -import anndata as ad -import sklearn.linear_model - -## VIASH START -# Note: this section is auto-generated by viash at runtime. To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. -par = { - 'input_train': 'resources_test/task_template/cxg_mouse_pancreas_atlas/train.h5ad', - 'input_test': 'resources_test/task_template/cxg_mouse_pancreas_atlas/test.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'name': 'logistic_regression' -} -## VIASH END - -print('Reading input files', flush=True) -input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) - -print('Preprocess data', flush=True) -# ... preprocessing ... - -print('Train model', flush=True) -# ... train model ... -classifier = sklearn.linear_model.LogisticRegression() -classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) - -print('Generate predictions', flush=True) -# ... generate predictions ... 
-obs_label_pred = classifier.predict(input_test.obsm["X_pca"]) - -print("Write output AnnData to file", flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': input_train.uns['dataset_id'], - 'normalization_id': input_train.uns['normalization_id'], - 'method_id': meta['name'] - }, - obs={ - 'label_pred': obs_label_pred - } -) -output.obs_names = input_test.obs_names - -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/metrics/accuracy/config.vsh.yaml b/src/metrics/accuracy/config.vsh.yaml deleted file mode 100644 index 66fa835..0000000 --- a/src/metrics/accuracy/config.vsh.yaml +++ /dev/null @@ -1,70 +0,0 @@ -# The API specifies which type of component this is. -# It contains specifications for: -# - The input/output files -# - Common parameters -# - A unit test -__merge__: ../../api/comp_metric.yaml - - -# A unique identifier for your component (required). -# Can contain only lowercase letters or underscores. -name: accuracy - -# Metadata for your component -info: - metrics: - # A unique identifier for your metric (required). - # Can contain only lowercase letters or underscores. - - name: accuracy - # A relatively short label, used when rendering visualisarions (required) - label: Accuracy - # A one sentence summary of how this metric works (required). Used when - # rendering summary tables. - summary: "The percentage of correctly predicted labels." - # A multi-line description of how this component works (required). Used - # when rendering reference documentation. - description: | - The percentage of correctly predicted labels. - # A reference key from the bibtex library at src/common/library.bib (required). 
- references: - doi: 10.48550/arXiv.2008.05756 - # The minimum possible value for this metric (required) - min: 0 - # The maximum possible value for this metric (required) - max: 1 - # Whether a higher value represents a 'better' solution (required) - maximize: true - -# Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. - -# Resources required to run the component -resources: - # The script of your component (required) - - type: python_script - path: script.py - # Additional resources your script needs (optional) - # - type: file - # path: weights.pt - -engines: - # Specifications for the Docker image for this component. - - type: docker - image: openproblems/base_python:1.0.0 - # Add custom dependencies here (optional). For more information, see - # https://viash.io/reference/config/engines/docker/#setup . - setup: - - type: python - packages: scikit-learn - -runners: - # This platform allows running the component natively - - type: executable - # Allows turning the component into a Nextflow module / pipeline. - - type: nextflow - directives: - label: [midtime, midmem, midcpu] diff --git a/src/metrics/accuracy/script.py b/src/metrics/accuracy/script.py deleted file mode 100644 index 054e809..0000000 --- a/src/metrics/accuracy/script.py +++ /dev/null @@ -1,47 +0,0 @@ -import anndata as ad -import numpy as np -import sklearn.preprocessing - -## VIASH START -# Note: this section is auto-generated by viash at runtime. To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
-par = { - 'input_solution': 'resources_test/task_template/cxg_mouse_pancreas_atlas/solution.h5ad', - 'input_prediction': 'resources_test/task_template/cxg_mouse_pancreas_atlas/prediction.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'name': 'accuracy' -} -## VIASH END - -print('Reading input files', flush=True) -input_solution = ad.read_h5ad(par['input_solution']) -input_prediction = ad.read_h5ad(par['input_prediction']) - -assert (input_prediction.obs_names == input_solution.obs_names).all(), "obs_names not the same in prediction and solution inputs" - -print("Encode labels", flush=True) -cats = list(input_solution.obs["label"].dtype.categories) + list(input_prediction.obs["label_pred"].dtype.categories) -encoder = sklearn.preprocessing.LabelEncoder().fit(cats) -input_solution.obs["label"] = encoder.transform(input_solution.obs["label"]) -input_prediction.obs["label_pred"] = encoder.transform(input_prediction.obs["label_pred"]) - - -print('Compute metrics', flush=True) -# metric_ids and metric_values can have length > 1 -# but should be of equal length -uns_metric_ids = [ 'accuracy' ] -uns_metric_values = np.mean(input_solution.obs["label"] == input_prediction.obs["label_pred"]) - -print("Write output AnnData to file", flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': input_prediction.uns['dataset_id'], - 'normalization_id': input_prediction.uns['normalization_id'], - 'method_id': input_prediction.uns['method_id'], - 'metric_ids': uns_metric_ids, - 'metric_values': uns_metric_values - } -) -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml deleted file mode 100644 index 64ff740..0000000 --- a/src/workflows/process_datasets/config.vsh.yaml +++ /dev/null @@ -1,38 +0,0 @@ -name: process_datasets -namespace: workflows - -argument_groups: - - name: Inputs - arguments: - - name: "--input" - __merge__: /src/api/file_common_dataset.yaml - required: 
true - direction: input - - name: Outputs - arguments: - - name: "--output_train" - __merge__: /src/api/file_train.yaml - required: true - direction: output - - name: "--output_test" - __merge__: /src/api/file_test.yaml - required: true - direction: output - - name: "--output_solution" - __merge__: /src/api/file_solution.yaml - required: true - direction: output - -resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - path: /common/nextflow_helpers/helper.nf - -dependencies: - - name: schema/verify_data_structure - repository: core - - name: data_processors/process_dataset - -runners: - - type: nextflow diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf deleted file mode 100644 index 25d1260..0000000 --- a/src/workflows/process_datasets/main.nf +++ /dev/null @@ -1,55 +0,0 @@ -include { findArgumentSchema } from "${meta.resources_dir}/helper.nf" - -workflow auto { - findStates(params, meta.config) - | meta.workflow.run( - auto: [publish: "state"] - ) -} - -workflow run_wf { - take: - input_ch - - main: - output_ch = input_ch - - | verify_data_structure.run( - fromState: { id, state -> - def schema = findArgumentSchema(meta.config, "input") - def schemaYaml = tempFile("schema.yaml") - writeYaml(schema, schemaYaml) - [ - "input": state.input, - "schema": schemaYaml - ] - }, - toState: { id, output, state -> - // read the output to see if dataset passed the qc - def checks = readYaml(output.output) - state + [ - "dataset": checks["exit_code"] == 0 ? 
state.input : null, - ] - } - ) - - // remove datasets which didn't pass the schema check - | filter { id, state -> - state.dataset != null - } - - | process_dataset.run( - fromState: [ input: "dataset" ], - toState: [ - output_train: "output_train", - output_test: "output_test", - output_solution: "output_solution" - ] - ) - - // only output the files for which an output file was specified - | setState(["output_train", "output_test", "output_solution"]) - - emit: - output_ch -} diff --git a/src/workflows/process_datasets/test.sh b/src/workflows/process_datasets/test.sh deleted file mode 100755 index d918102..0000000 --- a/src/workflows/process_datasets/test.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -# Run this prior to executing this script: -# bin/viash_build -q 'batch_integration' - -# get the root of the directory -REPO_ROOT=$(git rev-parse --show-toplevel) - -# ensure that the command below is run from the root of the repository -cd "$REPO_ROOT" - -set -e - -DATASETS_DIR="resources_test/common" -OUTPUT_DIR="output/process_datasets_test" - -if [ ! -d "$OUTPUT_DIR" ]; then - mkdir -p "$OUTPUT_DIR" -fi - -export NXF_VER=24.04.3 - -nextflow run . 
\ - -main-script target/nextflow/workflows/process_datasets/main.nf \ - -profile docker \ - -entry auto \ - -c common/nextflow_helpers/labels_ci.config \ - --id run_test \ - --input_states "$DATASETS_DIR/**/state.yaml" \ - --rename_keys 'input:output_dataset' \ - --settings '{"output_train": "train.h5ad", "output_test": "test.h5ad"}' \ - --publish_dir "$OUTPUT_DIR" \ - --output_state "state.yaml" \ No newline at end of file diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml deleted file mode 100644 index 904a2b6..0000000 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ /dev/null @@ -1,72 +0,0 @@ -name: run_benchmark -namespace: workflows - -argument_groups: - - name: Inputs - arguments: - - name: "--input_train" - __merge__: /src/api/file_train.yaml - type: file - direction: input - required: true - - name: "--input_test" - __merge__: /src/api/file_test.yaml - type: file - direction: input - required: true - - name: "--input_solution" - __merge__: /src/api/file_solution.yaml - type: file - direction: input - required: true - - name: Outputs - arguments: - - name: "--output_scores" - type: file - required: true - direction: output - description: A yaml file containing the scores of each of the methods - default: score_uns.yaml - - name: "--output_method_configs" - type: file - required: true - direction: output - default: method_configs.yaml - - name: "--output_metric_configs" - type: file - required: true - direction: output - default: metric_configs.yaml - - name: "--output_dataset_info" - type: file - required: true - direction: output - default: dataset_uns.yaml - - name: "--output_task_info" - type: file - required: true - direction: output - default: task_info.yaml - - name: Methods - arguments: - - name: "--method_ids" - type: string - multiple: true - description: A list of method ids to run. If not specified, all methods will be run. 
- -resources: - - type: nextflow_script - path: main.nf - entrypoint: run_wf - - type: file - path: /_viash.yaml - -dependencies: - - name: h5ad/extract_uns_metadata - repository: core - - name: control_methods/true_labels - - name: methods/logistic_regression - - name: metrics/accuracy - -runners: - - type: nextflow diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf deleted file mode 100644 index 826dec4..0000000 --- a/src/workflows/run_benchmark/main.nf +++ /dev/null @@ -1,191 +0,0 @@ -workflow auto { - findStates(params, meta.config) - | meta.workflow.run( - auto: [publish: "state"] - ) -} - -// construct list of methods and control methods -methods = [ - true_labels, - logistic_regression -] - -// construct list of metrics -metrics = [ - accuracy -] - -workflow run_wf { - take: - input_ch - - main: - - /**************************** - * EXTRACT DATASET METADATA * - ****************************/ - dataset_ch = input_ch - // store join id - | map{ id, state -> - [id, state + ["_meta": [join_id: id]]] - } - - // extract the dataset metadata - | extract_uns_metadata.run( - fromState: [input: "input_solution"], - toState: { id, output, state -> - state + [ - dataset_uns: readYaml(output.output).uns - ] - } - ) - - /*************************** - * RUN METHODS AND METRICS * - ***************************/ - score_ch = dataset_ch - - // run all methods - | runEach( - components: methods, - - // use the 'filter' argument to only run a method on the normalisation the component is asking for - filter: { id, state, comp -> - def norm = state.dataset_uns.normalization_id - def pref = comp.config.info.preferred_normalization - // if the preferred normalisation is none at all, - // we can pass whichever dataset we want - def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref - def method_check = !state.method_ids || state.method_ids.contains(comp.config.name) - - method_check && norm_check - }, - - // define a new 'id' by 
appending the method name to the dataset id - id: { id, state, comp -> - id + "." + comp.config.name - }, - - // use 'fromState' to fetch the arguments the component requires from the overall state - fromState: { id, state, comp -> - def new_args = [ - input_train: state.input_train, - input_test: state.input_test - ] - if (comp.config.info.type == "control_method") { - new_args.input_solution = state.input_solution - } - new_args - }, - - // use 'toState' to publish that component's outputs to the overall state - toState: { id, output, state, comp -> - state + [ - method_id: comp.config.name, - method_output: output.output - ] - } - ) - - // run all metrics - | runEach( - components: metrics, - id: { id, state, comp -> - id + "." + comp.config.name - }, - // use 'fromState' to fetch the arguments the component requires from the overall state - fromState: [ - input_solution: "input_solution", - input_prediction: "method_output" - ], - // use 'toState' to publish that component's outputs to the overall state - toState: { id, output, state, comp -> - state + [ - metric_id: comp.config.name, - metric_output: output.output - ] - } - ) - - // extract the scores - | extract_uns_metadata.run( - key: "extract_scores", - fromState: [input: "metric_output"], - toState: { id, output, state -> - state + [ - score_uns: readYaml(output.output).uns - ] - } - ) - - | joinStates { ids, states -> - // store the scores in a file - def score_uns = states.collect{it.score_uns} - def score_uns_yaml_blob = toYamlBlob(score_uns) - def score_uns_file = tempFile("score_uns.yaml") - score_uns_file.write(score_uns_yaml_blob) - - ["output", [output_scores: score_uns_file]] - } - - /****************************** - * GENERATE OUTPUT YAML FILES * - ******************************/ - // TODO: can we store everything below in a separate helper function? 
- - // extract the dataset metadata - meta_ch = dataset_ch - // only keep one of the normalization methods - | filter{ id, state -> - state.dataset_uns.normalization_id == "log_cp10k" - } - | joinStates { ids, states -> - // store the dataset metadata in a file - def dataset_uns = states.collect{state -> - def uns = state.dataset_uns.clone() - uns.remove("normalization_id") - uns - } - def dataset_uns_yaml_blob = toYamlBlob(dataset_uns) - def dataset_uns_file = tempFile("dataset_uns.yaml") - dataset_uns_file.write(dataset_uns_yaml_blob) - - // store the method configs in a file - def method_configs = methods.collect{it.config} - def method_configs_yaml_blob = toYamlBlob(method_configs) - def method_configs_file = tempFile("method_configs.yaml") - method_configs_file.write(method_configs_yaml_blob) - - // store the metric configs in a file - def metric_configs = metrics.collect{it.config} - def metric_configs_yaml_blob = toYamlBlob(metric_configs) - def metric_configs_file = tempFile("metric_configs.yaml") - metric_configs_file.write(metric_configs_yaml_blob) - - // store the task info in a file - def viash_file = meta.resources_dir.resolve("_viash.yaml") - - // create output state - def new_state = [ - output_dataset_info: dataset_uns_file, - output_method_configs: method_configs_file, - output_metric_configs: metric_configs_file, - output_task_info: viash_file, - _meta: states[0]._meta - ] - - ["output", new_state] - } - - // merge all of the output data - output_ch = score_ch - | mix(meta_ch) - | joinStates{ ids, states -> - def mergedStates = states.inject([:]) { acc, m -> acc + m } - [ids[0], mergedStates] - } - - emit: - output_ch -}