test files rds added

openproblems-bio · Jul 23, 2024 · 75ba304 · 75ba304
1 parent 4b60ce1
commit 75ba304
Show file tree

Hide file tree

Showing 11 changed files with 77 additions and 72 deletions.
diff --git a/notebooks/create_resources.ipynb b/notebooks/create_resources.ipynb
@@ -18,7 +18,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -166,23 +166,23 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# test rresources"
+    "# test resources"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 76,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
     "import os\n",
     "test_resource_dir = f'{resource_dir}/../../resources_test/grn-benchmark'\n",
-    "os.makedirs(test_resource_dir, exist_ok=True)"
+    "os.makedirs(test_resource_dir, exist_ok=True)\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 53,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -192,7 +192,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 69,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -201,12 +201,14 @@
     "hvgs = ad.read_h5ad(f'{resource_dir}/prior_data.h5ad').uns['hvgs']\n",
     "genes_multi = ad.read_h5ad(f'{resource_dir}/prior_data.h5ad').uns['gene_names']\n",
     "tfs = ad.read_h5ad(f'{resource_dir}/prior_data.h5ad').uns['tf_list']\n",
-    "genes = set(tfs) & set(genes_multi)\n"
+    "genes = set(tfs) & set(genes_multi)\n",
+    "\n",
+    "peaks = np.random.choice(peaks, 1000)\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -219,17 +221,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 77,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "View of AnnData object with n_obs × n_vars = 1000 × 4962\n",
+       "View of AnnData object with n_obs × n_vars = 1000 × 868\n",
        "    obs: 'cell_type', 'donor_id'"
       ]
      },
-     "execution_count": 77,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -242,7 +244,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 78,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [

diff --git a/src/methods/figr/script.R b/src/methods/figr/script.R
@@ -11,8 +11,8 @@ par <- list(
   multiomics_rna = "resources_test/grn-benchmark/multiomics_r/rna.rds",
   multiomics_atac = "resources_test/grn-benchmark/multiomics_r/atac.rds",
   temp_dir =  "output/figr/",
-  cell_topic = "resources/grn-benchmark/supp/cell_topic.csv",
-  num_workers = 40,
+  cell_topic = "resources_test/grn-benchmark/supp/cell_topic.csv",
+  num_workers = 1,
   n_topics = 48,
   peak_gene = "output/figr/peak_gene.csv",
   prediction= "output/figr/prediction.csv"
@@ -21,6 +21,7 @@ par <- list(
 #   functionality_name = "my_method_r"
 # )
 ## VIASH END
+dir.create(par$temp_dir, recursive = TRUE, showWarnings = TRUE)
 
 cellknn_func <- function(par) {
   ## load cell topic probabilities and create cell-cluster matrix
@@ -30,9 +31,9 @@ cellknn_func <- function(par) {
   cellkNN <- get.knn(cell_topic, k=par$n_topics)$nn.index
   rownames(cellkNN) <- rownames(cell_topic)
   print(dim(cellkNN))
+  print(paste0(par$temp_dir, "cellkNN.rds"))
   saveRDS(cellkNN, paste0(par$temp_dir, "cellkNN.rds"))
 }
-print(par)
 cellknn_func(par)
 
 ## Step1: Peak-gene association testing
@@ -48,6 +49,8 @@ peak_gene_func <- function(par){
   write.csv(cisCorr, paste0(par$temp_dir, "cisCorr.csv"), row.names = TRUE)
 }
 
+peak_gene_func(par)
+
 ## Step 2: create DORCs and smooth them 
 dorc_genes_func <- function(par){
   cisCorr = read.csv(paste0(par$temp_dir, "cisCorr.csv"))
@@ -61,10 +64,12 @@ dorc_genes_func <- function(par){
                           dorcTab = cisCorr.filt,
                           geneList = allGenes,
                           nCores = par$num_workers)
-
+  print(print(paste0(par$temp_dir, "cellkNN.rds")))
   cellkNN = readRDS(paste0(par$temp_dir, "cellkNN.rds"))
   # Smooth dorc scores using cell KNNs (k=n_topics)
   n_topics = par$n_topics
+  common_cells <- intersect(rownames(cellkNN), colnames(rna))
+  cellkNN = cellkNN[common_cells,]
   dorcMat.s <- smoothScoresNN(NNmat = cellkNN[,1:n_topics], mat = dorcMat, nCores = 4)
 
   # Smooth RNA using cell KNNs
@@ -76,6 +81,7 @@ dorc_genes_func <- function(par){
   saveRDS(RNAmat.s, paste0(par$temp_dir, "RNAmat.s.RDS"))
   saveRDS(dorcMat.s, paste0(par$temp_dir, "dorcMat.s.RDS"))
 }
+dorc_genes_func(par)
 
 ## TF-gene associations
 tf_gene_association_func <- function(par){
@@ -93,7 +99,7 @@ tf_gene_association_func <- function(par){
 
   write.csv(figR.d, paste0(par$temp_dir, "figR.d.csv"))
 }
-
+tf_gene_association_func(par)
 extract_peak_gene_func <- function(par) {
   # Read the CSV file
   peak_gene_figr <- read.csv(file.path(par$temp_dir, "cisCorr.filt.csv"))
@@ -117,6 +123,7 @@ extract_peak_gene_func <- function(par) {
   # Write the result to a CSV file
   write.csv(peak_gene_figr, file = par$peak_gene, row.names = FALSE)
 }
+extract_peak_gene_func(par)
 
 filter_figr_grn <- function(par) {
   # Read the CSV file
@@ -145,8 +152,4 @@ filter_figr_grn <- function(par) {
 }
 
 
-# peak_gene_func(par)
-# dorc_genes_func(par)
-# tf_gene_association_func(par)
-# extract_peak_gene_func(par)
-# filter_figr_grn(par)
+filter_figr_grn(par)
diff --git a/src/methods/scglue/main.py b/src/methods/scglue/main.py
@@ -71,6 +71,7 @@ def preprocess(rna, atac, par):
     nx.write_graphml(guidance, f"{par['temp_dir']}/guidance.graphml.gz")
 
 def training(par):
+    os.makedirs(f"{par['temp_dir']}/glue", exist_ok=True)
     rna = ad.read_h5ad(f"{par['temp_dir']}/rna.h5ad")
     atac = ad.read_h5ad(f"{par['temp_dir']}/atac.h5ad")
     guidance = nx.read_graphml(f"{par['temp_dir']}/guidance.graphml.gz")
@@ -82,6 +83,7 @@ def training(par):
         atac, "NB", use_highly_variable=False,
         use_rep="X_lsi"
     )
+
 
     glue = scglue.models.fit_SCGLUE(
         {"rna": rna, "atac": atac}, guidance,

diff --git a/src/methods/scglue/script.py b/src/methods/scglue/script.py
@@ -1,6 +1,7 @@
 import pandas as pd
 import anndata as ad
 import sys
+import os
 
 ## VIASH START
 par = {
@@ -17,8 +18,10 @@
 #   "resources_dir":'resources'
 # }
 
+
 sys.path.append(meta["resources_dir"])
 from main import main 
+os.makedirs(par['temp_dir'], exist_ok=True)
 prediction = main(par)
 
 print('Write output to file', flush=True)

diff --git a/src/pre_methods/format_multiomics_R/script.R b/src/pre_methods/format_multiomics_R/script.R
diff --git a/src/process_data/adata_2_matrix/config.novsh.yaml b/src/process_data/adata_2_matrix/config.novsh.yaml
@@ -0,0 +1,29 @@
+functionality:
+  name: reformat_resources_r
+  info:
+    label: reformat_resources_r
+    summary: "Converts data to format needed for R methods."
+
+  arguments:
+    - name: --multiomics_rna
+      __merge__: ../../api/file_multiomics_rna_h5ad.yaml
+      required: True
+      direction: input
+    - name: --multiomics_atac
+      __merge__: ../../api/file_multiomics_atac_h5ad.yaml
+      required: false
+      direction: input
+
+  resources:
+    - type: python_script
+      path: script.py
+
+
+platforms:
+  - type: docker
+    image: ghcr.io/openproblems-bio/base_python:1.0.4
+
+  - type: native
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]
diff --git a/...ethods/format_multiomics_R/format_data.py → src/process_data/adata_2_matrix/script.py b/...ethods/format_multiomics_R/format_data.py → src/process_data/adata_2_matrix/script.py
@@ -7,9 +7,9 @@
 from scipy.io import mmwrite
 
 par = {
-  "multiomics_rna": "resources/grn-benchmark/multiomics_rna.h5ad",
-  "multiomics_atac": "resources/grn-benchmark/multiomics_atac.h5ad",
-  "temp_dir": 'output/figr'
+  "multiomics_rna": "resources_test/grn-benchmark/multiomics_rna.h5ad",
+  "multiomics_atac": "resources_test/grn-benchmark/multiomics_atac.h5ad",
+  "temp_dir": 'resources_test/matrixdata'
 
 }
 
@@ -46,4 +46,6 @@ def format_data(par):
 
     print('Format data completed', flush=True)
 
-format_data(par)
+format_data(par)
+
+
diff --git a/src/pre_methods/cistopic/cistopic.py → src/process_data/cistopic/cistopic.py b/src/pre_methods/cistopic/cistopic.py → src/process_data/cistopic/cistopic.py
diff --git a/src/pre_methods/cistopic/config.novsh.yaml → src/process_data/cistopic/config.novsh.yaml b/src/pre_methods/cistopic/config.novsh.yaml → src/process_data/cistopic/config.novsh.yaml
diff --git a/...ods/format_multiomics_R/config.novsh.yaml → ...ata/format_multiomics_R/config.novsh.yaml b/...ods/format_multiomics_R/config.novsh.yaml → ...ata/format_multiomics_R/config.novsh.yaml
@@ -13,6 +13,13 @@ functionality:
       __merge__: ../../api/file_multiomics_atac_h5ad.yaml
       required: false
       direction: input
+
+    - name: --rna_rds
+      required: false
+      direction: output
+    - name: --atac_rna
+      required: false
+      direction: output
 
   resources:
     - type: r_script

diff --git a/...thods/format_multiomics_R/format_data_r.R → ..._data/format_multiomics_R/format_data_r.R b/...thods/format_multiomics_R/format_data_r.R → ..._data/format_multiomics_R/format_data_r.R
@@ -3,13 +3,12 @@ library(FNN)
 library(chromVAR)
 library(doParallel)
 library(FigR)
-library(optparse)
 
 # Example usage
 par <- list(
-    temp_dir = 'output/figr',
-    rna_rds = 'resources/grn-benchmark/multiomics_r/rna.rds',
-    atac_rds = 'resources/grn-benchmark/multiomics_r/atac.rds'
+    temp_dir = 'resources_test/matrixdata',
+    rna_rds = 'resources_test/grn-benchmark/multiomics_r/rna.rds',
+    atac_rds = 'resources_test/grn-benchmark/multiomics_r/atac.rds'
 )