diff --git a/.github/workflows/test-mac-arm.yml b/.github/workflows/test-mac-arm.yml deleted file mode 100644 index 18a658c..0000000 --- a/.github/workflows/test-mac-arm.yml +++ /dev/null @@ -1,55 +0,0 @@ -on: - pull_request: - push: - branches: main - -name: Test on mac arm - -jobs: - build-deploy: - runs-on: macos-latest - - permissions: - contents: write - - steps: - - name: Check out repository - uses: actions/checkout@v4 - - - name: Set up specific version of Python - uses: actions/setup-python@v5 - with: - python-version: '3.12' - - - name: Install R - uses: r-lib/actions/setup-r@v2 - - - name: Set up Quarto - uses: quarto-dev/quarto-actions/setup@v2 - - - name: Make sure data is available - run: | - AWS_EC2_METADATA_DISABLED=true \ - aws s3 cp \ - --no-sign-request \ - s3://openproblems-bio/public/neurips-2023-competition/sc_counts_reannotated_with_counts.h5ad \ - book/usecase/data/sc_counts_reannotated_with_counts.h5ad - - # attempt with renv - - name: Set up renv - uses: r-lib/actions/setup-renv@v2 - - - name: Cache certain directories to speed up build - uses: actions/cache@v2 - with: - path: | - book/usecase/data - key: quarto_cache_osx_${{ github.ref_name }} - restore-keys: | - quarto_cache_osx_main - - - name: Render book - run: | - Rscript -e "renv::restore()" - source renv/python/virtualenvs/renv-python-3.12/bin/activate - quarto render diff --git a/book/in_memory_interoperability.qmd b/book/in_memory_interoperability.qmd index 64e1ec0..8bb2142 100644 --- a/book/in_memory_interoperability.qmd +++ b/book/in_memory_interoperability.qmd @@ -62,6 +62,10 @@ The `anndata2ri` package provides, apart from functionality to convert SingleCel TODO: how to subscript sparse matrix? Is it possible? +```{r include=FALSE} +library(SingleCellExperiment) +``` + ```{python rpy2_sparse} import scipy as sp diff --git a/book/on_disk_interoperability.qmd b/book/on_disk_interoperability.qmd index d934563..c2afc9e 100644 --- a/book/on_disk_interoperability.qmd +++ b/book/on_disk_interoperability.qmd @@ -11,29 +11,65 @@ The downside is that it can lead to increased storage requirements and I/O overh Debugging is only possible for individual scripts and the workflow is not as interactive and explorative as the in-memory approach. -## Different files +## Different file formats -Data format based interoperability +It's important to differentiate between language-specific and language-agnostic file formats. For example, most languages can serialize objects to disk. R has the `.RDS` file format and Python has the `.pickle/.pkl` file format, but these are not interoperable. Older versions of a language can also have problems reading serialized objects created by the latest version. + +For a file format to be language-agnostic, it should have a mature standard describing how the data is stored on disk. This standard should be implemented in multiple languages, documented, and tested for compatibility. Some file formats have a reference implementation in C, which can be used to create bindings for other languages. Most also have a status page that lists the available implementations and which version of the standard (or how much of it) each supports. 
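+ +For instance, a minimal sketch of exchanging a data frame through a language-agnostic format (the file name and columns below are hypothetical): a table written to Parquet from Python with pandas can be read back in R with `arrow::read_parquet()`. + +```python +import pandas as pd + +# Hypothetical summary table; any columnar data frame works the same way +df = pd.DataFrame({"gene": ["CD3E", "MS4A1"], "mean_expr": [1.2, 0.8]}) + +# Writing Parquet requires an engine such as pyarrow to be installed +df.to_parquet("expression_summary.parquet") +# In R: df <- arrow::read_parquet("expression_summary.parquet") +```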
+ + + +### Dataframes + +| File Format | Support in Python | Support in R | Text-Based | Binary | High-Performance | Ease-of-Use | Compression | Suitable for Single-Cell | Suitable for Spatial | +|-------------|-------------------|--------------|------------|--------|------------------|-------------|-------------|--------------------------|----------------------| +| CSV | Yes | Yes | Yes | No | No | High | No | Limited | No | +| JSON | Yes | Yes | Yes | No | No | Medium | No | Limited | No | +| Parquet | Yes | Yes | No | Yes | Yes | High | Yes | Limited | No | +| Feather | Yes | Yes | No | Yes | Yes | High | Yes | Yes | No | +| HDF5 | Yes | Yes | No | Yes | Yes | Medium | Yes | Yes | Limited | + + +### n-dimensional arrays + +| File Format | Support in Python | Support in R | Text-Based | Binary | High-Performance | Ease-of-Use | Compression | Suitable for Single-Cell | Suitable for Spatial | +|-------------|-------------------|--------------|------------|--------|------------------|-------------|-------------|--------------------------|----------------------| +| h5ad | Yes | Limited | No | Yes | Yes | Medium | Yes | Yes | Yes | +| Zarr | Yes | Yes | No | Yes | Yes | Medium | Yes | Yes | Yes | +| NumPy (npy) | Yes | Limited | No | Yes | Yes | High | No | Yes | No | +| NetCDF | Yes | Yes | No | Yes | Yes | Medium | Yes | Yes | Yes | +| TIFF | Yes | Limited | No | Yes | Medium | Medium | Yes | No | Yes | -1. h5ad / zarr / Apache Arrow -2. Reading and writing these formats -TODO: comparison table ## Different on-disk pipelines You can use a shell script to run the pipeline in a sequential manner. This requires all the dependencies to be installed in one large environment. -TODO: compare notebook and script pipelines +Usually you start in a notebook with an exploratory analysis, then move to a script for reproducibility, and finally to a pipeline for scalability. + +The scripts in such a script pipeline are a collection of the code snippets from the notebooks and can be written in different languages and executed in sequence. + +Alternatively, there are frameworks that keep the notebooks and create a pipeline from them. The upside is that you can avoid converting the code snippets in the notebooks to scripts. The downside is that you have to use a specific framework and the notebooks can become very large and unwieldy. ## Notebook pipelines -TODO: talk about Quarto, nb and papermill +You can use [Quarto](https://quarto.org/) to run code snippets in different languages in the same `.qmd` notebook. Our [Use-case chapter](./usecase/) is one example of this. + +For example, [Papermill](https://github.com/nteract/papermill) can execute Jupyter notebooks in sequence and pass variables between them. [Ploomber](https://github.com/ploomber/ploomber) is another example. 
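+ +A minimal sketch of parameterized execution with Papermill's Python API (the notebook names and parameters are hypothetical): + +```python +import papermill as pm + +# Executes the notebook top to bottom and injects the given values +# after its "parameters"-tagged cell, writing the executed copy to a new file +pm.execute_notebook( +    "preprocess.ipynb", +    "outputs/preprocess_run.ipynb", +    parameters={"input_path": "data/sc_counts.h5ad", "n_top_genes": 2000}, +) +```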
+ +### Execute notebooks via the CLI +Jupyter notebooks can be executed via [nbconvert](https://nbconvert.readthedocs.io/en/latest/#): ```bash jupyter nbconvert --to notebook --execute my_notebook.ipynb --allow-errors --output-dir outputs/ ``` +[RMarkdown](https://rmarkdown.rstudio.com/): +```bash +Rscript -e "rmarkdown::render('my_notebook.Rmd',params=list(args = myarg))" +``` + ## Script pipelines ### Calling scripts in the same environment diff --git a/book/workflow_frameworks/examples/viash_nextflow/.gitignore b/book/workflow_frameworks/examples/viash_nextflow/.gitignore new file mode 100644 index 0000000..401d616 --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/.gitignore @@ -0,0 +1,3 @@ +/work +/target +/.nextflow* \ No newline at end of file diff --git a/book/workflow_frameworks/examples/viash_nextflow/_viash.yaml b/book/workflow_frameworks/examples/viash_nextflow/_viash.yaml new file mode 100644 index 0000000..6654ed6 --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/_viash.yaml @@ -0,0 +1,3 @@ +name: polygloty_usecase +version: 0.1.0 +viash_version: 0.9.0 \ No newline at end of file diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/compute_pseudobulk/config.vsh.yaml b/book/workflow_frameworks/examples/viash_nextflow/src/compute_pseudobulk/config.vsh.yaml new file mode 100644 index 0000000..7e486af --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/compute_pseudobulk/config.vsh.yaml @@ -0,0 +1,56 @@ +name: compute_pseudobulk +description: Compute pseudobulk expression from an AnnData object + +argument_groups: + - name: Inputs + arguments: + - type: file + name: --input + description: Path to the input h5ad file + example: /path/to/input.h5ad + required: true + - name: Pseudobulk arguments + arguments: + - type: string + name: --obs_column_index + description: Name of the column to pseudobulk on + example: cell_type + required: true + - type: string + name: --obs_column_values + description: List of column names for the new obs data frame + example: ["batch", "sample"] + multiple: true + required: true + - name: Outputs + arguments: + - type: file + name: --output + description: Path to the output h5ad file + example: /path/to/output.h5ad + required: true + direction: output + +resources: + - type: python_script + path: script.py + +test_resources: + - type: python_script + path: test.py + +engines: + - type: docker + image: python:3.10 + setup: + - type: python + pypi: + - anndata + test_setup: + - type: python + pypi: + - viashpy + +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/compute_pseudobulk/script.py b/book/workflow_frameworks/examples/viash_nextflow/src/compute_pseudobulk/script.py new file mode 100644 index 0000000..90a6e07 --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/compute_pseudobulk/script.py @@ -0,0 +1,41 @@ +import anndata as ad +import pandas as pd +import numpy as np + +## VIASH START +par = {"input": "", "obs_column_index": "", "obs_column_values": [], "output": ""} +## VIASH END + +print("Load data", flush=True) +adata = ad.read_h5ad(par["input"]) + +print(f"Format of input data: {adata}", flush=True) +assert par["obs_column_index"] in adata.obs.columns, f"Column '{par['obs_column_index']}' not found in obs." +for col in par["obs_column_values"]: + assert col in adata.obs.columns, f"Column '{col}' not found in obs." 
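+ +# The block below builds a data frame of adata.X indexed by the `obs_column_index` column +# and sums over that index, so each unique value becomes one pseudobulk sample.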
+ +print("Compute pseudobulk", flush=True) +X = adata.X +if not isinstance(X, np.ndarray): + X = X.toarray() +combined = pd.DataFrame( + X, + index=adata.obs[par["obs_column_index"]], +) +combined.columns = adata.var_names +pb_X = combined.groupby(level=0).sum() + +print("Construct obs for pseudobulk") +pb_obs = adata.obs[par["obs_column_values"]].copy() +pb_obs.index = adata.obs[par["obs_column_index"]] +pb_obs = pb_obs.drop_duplicates() + +print("Create AnnData object") +pb_adata = ad.AnnData( + X=pb_X.loc[pb_obs.index].values, + obs=pb_obs, + var=adata.var, +) + +print("Store to disk") +pb_adata.write_h5ad(par["output"], compression="gzip") diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/compute_pseudobulk/test.py b/book/workflow_frameworks/examples/viash_nextflow/src/compute_pseudobulk/test.py new file mode 100644 index 0000000..02cd7e9 --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/compute_pseudobulk/test.py @@ -0,0 +1,43 @@ +import sys +import anndata as ad +import pytest +import numpy as np + +def test_subset_var(run_component, tmp_path): + input_path = tmp_path / "input.h5ad" + output_path = tmp_path / "output.h5ad" + + # create data + adata_in = ad.AnnData( + X=np.array([[1, 2], [3, 4], [5, 6], [7, 8]]), + obs={ + "cell_type": ["A", "A", "B", "B"], + "time": [1, 2, 1, 2], + "condition": ["ctrl", "ctrl", "trt", "trt"], + }, + var={"highly_variable": [True, False]}, + ) + + adata_in.write_h5ad(input_path) + + # run component + run_component([ + "--input", str(input_path), + "--obs_column_index", "cell_type", + "--obs_column_values", "condition", + "--output", str(output_path), + ]) + + # load output + adata_out = ad.read_h5ad(output_path) + + # check output + assert adata_out.X.shape == (2, 2) + assert np.all(adata_out.X == np.array([[4, 6], [12, 14]])) + assert adata_out.obs.index.tolist() == ["A", "B"] + assert adata_out.obs["condition"].tolist() == ["ctrl", "trt"] + assert adata_out.var["highly_variable"].tolist() == [True, False] + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/differential_expression/config.vsh.yaml b/book/workflow_frameworks/examples/viash_nextflow/src/differential_expression/config.vsh.yaml new file mode 100644 index 0000000..c7785fe --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/differential_expression/config.vsh.yaml @@ -0,0 +1,68 @@ +name: differential_expression +description: Compute differential expression between two observation types + +argument_groups: + - name: Inputs + arguments: + - type: file + name: --input + description: Path to the input h5ad file + example: /path/to/input.h5ad + required: true + - name: Differential expression arguments + arguments: + - type: string + name: --contrast + description: | + Contrast to compute. Must be of length 3: + + 1. The name of the column to contrast on + 2. The name of the first observation type + 3. 
The name of the second observation type + example: ["cell_type", "batch", "sample"] + multiple: true + required: true + - type: string + name: --design_formula + description: Design formula for the differential expression model + example: ~ batch + cell_type + - name: Outputs + arguments: + - type: file + name: --output + description: Path to the output h5ad file + example: /path/to/output.h5ad + required: true + direction: output + +resources: + - type: r_script + path: script.R + +test_resources: + - type: r_script + path: test.R + +engines: + - type: docker + image: rocker/r2u:22.04 + setup: + - type: apt + packages: + - python3 + - python3-pip + - python3-dev + - python-is-python3 + - type: python + pypi: + - anndata + - type: r + cran: + - anndata + - processx + bioc: + - DESeq2 + +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/differential_expression/script.R b/book/workflow_frameworks/examples/viash_nextflow/src/differential_expression/script.R new file mode 100644 index 0000000..f91a50d --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/differential_expression/script.R @@ -0,0 +1,42 @@ +library(anndata) +requireNamespace("DESeq2", quietly = TRUE) + +## VIASH START +par <- list(input = "", contrast = c(), design_formula = "", output = "") +## VIASH END + +cat("Reading data\n") +adata <- read_h5ad(par$input) + +cat("Parse formula\n") +formula <- as.formula(par$design_formula) + +cat("Create DESeq dataset\n") +# transform counts matrix +count_data <- t(as.matrix(adata$X)) +storage.mode(count_data) <- "integer" + +# create dataset +dds <- DESeq2::DESeqDataSetFromMatrix( + countData = count_data, + colData = adata$obs, + design = formula +) + +cat("Run DESeq2\n") +dds <- DESeq2::DESeq(dds) + +res <- DESeq2::results(dds, contrast = par$contrast) |> + as.data.frame() + +cat("Write to disk\n") +contrast_names <- gsub(" ", "_", par$contrast) +contrast_names <- gsub("[^[:alnum:]]", "_", contrast_names) +contrast_names <- gsub("__", "_", contrast_names) +contrast_names <- tolower(contrast_names) + +varm_name <- paste0("de_", paste(contrast_names, collapse = "_")) +adata$varm[[varm_name]] <- res + +# Save adata +zzz <- adata$write_h5ad(par$output, compression = "gzip") diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/differential_expression/test.R b/book/workflow_frameworks/examples/viash_nextflow/src/differential_expression/test.R new file mode 100644 index 0000000..cfd135c --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/differential_expression/test.R @@ -0,0 +1,61 @@ +library(anndata) + +cat("Create input data\n") +X <- matrix(runif(100, 10, 100), nrow = 10, ncol = 10) +for (i in 1:10) { + X[1:5, i] <- X[1:5, i] + i * 10 +} +adata_in <- AnnData( + X = X, + obs = data.frame( + row.names = paste0("cell", 1:10), + sm_name = rep(c("Belinostat", "Dimethyl Sulfoxide"), each = 5), + plate_name = rep(c("plate1", "plate2"), times = 5) + ), + var = data.frame( + row.names = paste0("gene", 1:10) + ) +) + +cat("Write input data to file\n") +input_path <- "input.h5ad" +output_path <- "output.h5ad" +zzz <- adata_in$write_h5ad(input_path) + +cat("Run component\n") +zzz <- processx::run( + command = meta$executable, + args = c( + "--input", input_path, + "--contrast", "sm_name;Dimethyl Sulfoxide;Belinostat", + "--design_formula", "~ sm_name + plate_name", + "--output", output_path + ), + error_on_status = TRUE, + echo = TRUE +) + +cat("Read output 
data\n") +adata_out <- read_h5ad(output_path) + +cat("Preview output data\n") +print(adata_out) + +cat("Check DE results:\n") +de_out <- adata_out$varm$de_sm_name_dimethyl_sulfoxide_belinostat +if (is.null(de_out)) { + stop("No DE results found") +} + +print(de_out) + +expected_colnames <- c("baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj") +if (!all(colnames(de_out) == expected_colnames)) { + stop(paste0( + "Column names do not match.\n", + "Expected: ", paste(expected_colnames, collapse = ", "), "\n", + "Actual: ", paste(colnames(de_out), collapse = ", ") + )) +} + +cat("Done\n") \ No newline at end of file diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/load_data/config.vsh.yaml b/book/workflow_frameworks/examples/viash_nextflow/src/load_data/config.vsh.yaml new file mode 100644 index 0000000..683168a --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/load_data/config.vsh.yaml @@ -0,0 +1,31 @@ +name: load_data +description: Load data from an S3 bucket + +arguments: + - type: string + name: --url + description: URL to the data + example: s3://my-bucket/my-data.csv + required: true + - type: file + name: --output + description: Path to the output file + example: /path/to/output.csv + required: true + direction: output + +resources: + - type: bash_script + path: script.sh + +test_resources: + - type: bash_script + path: test.sh + +engines: + - type: docker + image: amazon/aws-cli + +runners: + - type: executable + - type: nextflow diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/load_data/script.sh b/book/workflow_frameworks/examples/viash_nextflow/src/load_data/script.sh new file mode 100644 index 0000000..c0aca9d --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/load_data/script.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +aws s3 cp \ + --no-sign-request \ + "$par_url" \ + "$par_output" diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/load_data/test.sh b/book/workflow_frameworks/examples/viash_nextflow/src/load_data/test.sh new file mode 100644 index 0000000..5c0e486 --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/load_data/test.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Run the executable +"$meta_executable" \ + --url s3://openproblems-bio/public/neurips-2023-competition/moa_annotations.csv \ + --output moa_annotations.csv + +# Check if the output file exists +if [[ ! -f moa_annotations.csv ]]; then + echo "File not found!" + exit 1 +fi + +# Check if the output file has the correct MD5 sum +if [[ "$(md5sum moa_annotations.csv | cut -d ' ' -f 1)" != "80ebe44ce6b8d73f31dbc653787089f9" ]]; then + echo "MD5 sum does not match!" + exit 1 +fi + +echo "All tests passed!" 
\ No newline at end of file diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/subset_obs/config.vsh.yaml b/book/workflow_frameworks/examples/viash_nextflow/src/subset_obs/config.vsh.yaml new file mode 100644 index 0000000..43ee865 --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/subset_obs/config.vsh.yaml @@ -0,0 +1,58 @@ +name: subset_obs +description: Subset the observations of an AnnData object + +argument_groups: + - name: Inputs + arguments: + - type: file + name: --input + description: Path to the input h5ad file + example: /path/to/input.h5ad + required: true + - name: Subsetting arguments + arguments: + - type: string + name: --obs_column + description: Name of the column to subset on + example: cell_type + required: true + - type: string + name: --obs_values + description: List of values to subset on. If column is a boolean, do not pass any values to this argument. + multiple: true + example: ["B cell", "T cell"] + - type: boolean_true + name: --invert + description: Invert the subset + - name: Outputs + arguments: + - type: file + name: --output + description: Path to the output h5ad file + example: /path/to/output.h5ad + required: true + direction: output + +resources: + - type: python_script + path: script.py + +test_resources: + - type: python_script + path: test.py + +engines: + - type: docker + image: python:3.10 + setup: + - type: python + pypi: + - anndata + test_setup: + - type: python + pypi: + - viashpy + +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/subset_obs/script.py b/book/workflow_frameworks/examples/viash_nextflow/src/subset_obs/script.py new file mode 100644 index 0000000..3dcdcd4 --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/subset_obs/script.py @@ -0,0 +1,31 @@ +import anndata as ad + +## VIASH START +par = {"input": "", "obs_column": "", "obs_values": [], "invert": False, "output": ""} +## VIASH END + +print("Load data", flush=True) +adata = ad.read_h5ad(par["input"]) + +print(f"Format of input data: {adata}", flush=True) + +print("Subset data", flush=True) +filt = adata.obs[par["obs_column"]] + +# if filt is a list of booleans +assert (filt.dtype == bool) == (not par["obs_values"]), \ + f"If column '{par['obs_column']}' is boolean, 'obs_values' must be empty, and vice versa." 
+ +if filt.dtype != bool: + # if filt is a list of strings + filt = filt.isin(par["obs_values"]) + +if par["invert"]: + filt = ~filt + +adata = adata[filt].copy() + +print(f"Format of output data: {adata}", flush=True) + +print("Store to disk", flush=True) +adata.write_h5ad(par["output"], compression="gzip") diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/subset_obs/test.py b/book/workflow_frameworks/examples/viash_nextflow/src/subset_obs/test.py new file mode 100644 index 0000000..5892fa1 --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/subset_obs/test.py @@ -0,0 +1,37 @@ +import sys +import anndata as ad +import pytest +import numpy as np + +def test_subset_var(run_component, tmp_path): + input_path = tmp_path / "input.h5ad" + output_path = tmp_path / "output.h5ad" + + # create data + adata_in = ad.AnnData( + X=np.random.rand(4, 2), + obs={"cell_type": ["A", "B", "C", "D"]}, + var={"highly_variable": [True, False]}, + ) + + adata_in.write_h5ad(input_path) + + # run component + run_component([ + "--input", str(input_path), + "--obs_column", "cell_type", + "--obs_values", "A;B", + "--output", str(output_path), + ]) + + # load output + adata_out = ad.read_h5ad(output_path) + + # check output + assert adata_out.X.shape == (2, 2) + assert adata_out.obs["cell_type"].tolist() == ["A", "B"] + assert adata_out.var["highly_variable"].tolist() == [True, False] + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/subset_var/config.vsh.yaml b/book/workflow_frameworks/examples/viash_nextflow/src/subset_var/config.vsh.yaml new file mode 100644 index 0000000..1c8cdc9 --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/subset_var/config.vsh.yaml @@ -0,0 +1,58 @@ +name: subset_var +description: Subset the variables of an AnnData object + +argument_groups: + - name: Inputs + arguments: + - type: file + name: --input + description: Path to the input h5ad file + example: /path/to/input.h5ad + required: true + - name: Subsetting arguments + arguments: + - type: string + name: --var_column + description: Name of the column to subset on + example: highly_variable + required: true + - type: string + name: --var_values + description: List of values to subset on. If column is a boolean, do not pass any values to this argument. 
+ multiple: true + example: [] + - type: boolean_true + name: --invert + description: Invert the subset + - name: Outputs + arguments: + - type: file + name: --output + description: Path to the output h5ad file + example: /path/to/output.h5ad + required: true + direction: output + +resources: + - type: python_script + path: script.py + +test_resources: + - type: python_script + path: test.py + +engines: + - type: docker + image: python:3.10 + setup: + - type: python + pypi: + - anndata + test_setup: + - type: python + pypi: + - viashpy + +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/subset_var/script.py b/book/workflow_frameworks/examples/viash_nextflow/src/subset_var/script.py new file mode 100644 index 0000000..8086273 --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/subset_var/script.py @@ -0,0 +1,31 @@ +import anndata as ad + +## VIASH START +par = {"input": "", "var_column": "", "var_values": [], "invert": False, "output": ""} +## VIASH END + +print("Load data", flush=True) +adata = ad.read_h5ad(par["input"]) + +print(f"Format of input data: {adata}", flush=True) + +print("Subset data", flush=True) +filt = adata.var[par["var_column"]] + +# if filt is a list of booleans +assert (filt.dtype == bool) == (not par["var_values"]), \ + f"If column '{par['var_column']}' is boolean, 'var_values' must be empty, and vice versa." + +if filt.dtype != bool: + # if filt is a list of strings + filt = filt.isin(par["var_values"]) + +if par["invert"]: + filt = ~filt + +adata = adata[:, filt].copy() + +print(f"Format of output data: {adata}", flush=True) + +print("Store to disk", flush=True) +adata.write_h5ad(par["output"], compression="gzip") diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/subset_var/test.py b/book/workflow_frameworks/examples/viash_nextflow/src/subset_var/test.py new file mode 100644 index 0000000..0c4db92 --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/subset_var/test.py @@ -0,0 +1,36 @@ +import sys +import anndata as ad +import pytest +import numpy as np + +def test_subset_var(run_component, tmp_path): + input_path = tmp_path / "input.h5ad" + output_path = tmp_path / "output.h5ad" + + # create data + adata_in = ad.AnnData( + X=np.random.rand(4, 2), + obs={"cell_type": ["A", "B", "C", "D"]}, + var={"highly_variable": [True, False]}, + ) + + adata_in.write_h5ad(input_path) + + # run component + run_component([ + "--input", str(input_path), + "--var_column", "highly_variable", + "--output", str(output_path), + ]) + + # load output + adata_out = ad.read_h5ad(output_path) + + # check output + assert adata_out.X.shape == (4, 1) + assert adata_out.obs["cell_type"].tolist() == ["A", "B", "C", "D"] + assert adata_out.var["highly_variable"].tolist() == [True] + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__])) diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/workflow/config.vsh.yaml b/book/workflow_frameworks/examples/viash_nextflow/src/workflow/config.vsh.yaml new file mode 100644 index 0000000..e589477 --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/workflow/config.vsh.yaml @@ -0,0 +1,51 @@ +name: workflow +description: | + A workflow to compute differential expression between two groups of cells in an AnnData object. 
+ +argument_groups: + - name: Inputs + arguments: + - type: file + name: --input + description: Path to the input h5ad file + example: s3://my-bucket/my-data.h5ad + required: true + - name: Differential expression arguments + arguments: + - type: string + name: --contrast + description: | + Contrast to compute. Must be of length 3: + + 1. The name of the column to contrast on + 2. The name of the first observation type + 3. The name of the second observation type + example: ["cell_type", "batch", "sample"] + multiple: true + required: true + - type: string + name: --design_formula + description: Design formula for the differential expression model + example: ~ batch + cell_type + - name: Outputs + arguments: + - type: file + name: --output + description: Path to the output h5ad file + example: /path/to/output.h5ad + required: true + direction: output + +resources: + - type: nextflow_script + path: main.nf + entrypoint: wf + +dependencies: + - name: subset_obs + - name: subset_var + - name: compute_pseudobulk + - name: differential_expression + +runners: + - type: nextflow diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/workflow/main.nf b/book/workflow_frameworks/examples/viash_nextflow/src/workflow/main.nf new file mode 100644 index 0000000..584d78e --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/workflow/main.nf @@ -0,0 +1,58 @@ +workflow wf { + take: + ch_in + + main: + ch_out = ch_in + + | subset_obs.run( + key: "subset_sm_name", + fromState: ["input": "input"], + args: [ + "obs_column": "sm_name", + "obs_values": ["Belinostat", "Dimethyl Sulfoxide"] + ], + toState: ["data": "output"] + ) + + | subset_obs.run( + key: "subset_cell_type", + fromState: ["input": "data"], + args: [ + "obs_column": "cell_type", + "obs_values": ["T cells"] + ], + toState: ["data": "output"] + ) + + | subset_var.run( + fromState: ["input": "data"], + args: [ + "var_column": "highly_variable", + ], + toState: ["data": "output"] + ) + + | compute_pseudobulk.run( + fromState: ["input": "data"], + args: [ + "obs_column_index": "plate_well_celltype_reannotated", + "obs_column_values": ["sm_name", "cell_type", "plate_name", "well"], + ], + toState: ["data": "output"] + ) + + | differential_expression.run( + fromState: [ + "input": "data", + "contrast": "contrast", + "design_formula": "design_formula" + ], + toState: ["data": "output"] + ) + + | setState(["output": "data"]) + + emit: + ch_out +} \ No newline at end of file diff --git a/book/workflow_frameworks/examples/viash_nextflow/src/workflow/run.sh b/book/workflow_frameworks/examples/viash_nextflow/src/workflow/run.sh new file mode 100755 index 0000000..6e624ef --- /dev/null +++ b/book/workflow_frameworks/examples/viash_nextflow/src/workflow/run.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +nextflow run \ + target/nextflow/workflow/main.nf \ + -with-docker \ + --id dataset \ + --input s3://openproblems-bio/public/neurips-2023-competition/sc_counts_reannotated_with_counts.h5ad \ + --contrast 'sm_name;Belinostat;Dimethyl Sulfoxide' \ + --design_formula '~ sm_name + plate_name' \ + --publish_dir output \ No newline at end of file diff --git a/renv.lock b/renv.lock index 95fa381..6d1ba6b 100644 --- a/renv.lock +++ b/renv.lock @@ -353,6 +353,22 @@ ], "Hash": "1716e201f81ced0f456dd5ec85fe20f8" }, + "SingleCellExperiment": { + "Package": "SingleCellExperiment", + "Version": "1.24.0", + "Source": "Bioconductor", + "Requirements": [ + "BiocGenerics", + "DelayedArray", + "GenomicRanges", + "S4Vectors", + "SummarizedExperiment", + 
"methods", + "stats", + "utils" + ], + "Hash": "e21b3571ce76eb4dfe6bf7900960aa6a" + }, "SparseArray": { "Package": "SparseArray", "Version": "1.2.4",