scgpt added/workflow updated

openproblems-bio · Nov 18, 2024 · 403533f · 403533f
1 parent 662b882
commit 403533f
Show file tree

Hide file tree

Showing 13 changed files with 117 additions and 266 deletions.
diff --git a/runs.ipynb b/runs.ipynb
@@ -175,14 +175,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Submitted batch job 7838763\n"
+      "Submitted batch job 7838786\n"
      ]
     }
    ],
@@ -194,6 +194,86 @@
     "    calculate_scores()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "--2024-11-18 22:11:00--  https://drive.google.com/uc?export=download&id=1CPVtpWUJ2nkI9jGignlHLcefBe6Gk-F9\n",
+      "Resolving drive.google.com (drive.google.com)... 173.194.79.113, 173.194.79.139, 173.194.79.138, ...\n",
+      "Connecting to drive.google.com (drive.google.com)|173.194.79.113|:443... connected.\n",
+      "HTTP request sent, awaiting response... 303 See Other\n",
+      "Location: https://drive.usercontent.google.com/download?id=1CPVtpWUJ2nkI9jGignlHLcefBe6Gk-F9&export=download [following]\n",
+      "--2024-11-18 22:11:00--  https://drive.usercontent.google.com/download?id=1CPVtpWUJ2nkI9jGignlHLcefBe6Gk-F9&export=download\n",
+      "Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 173.194.69.132, 2a00:1450:4013:c04::84\n",
+      "Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|173.194.69.132|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 2425 (2.4K) [text/html]\n",
+      "Saving to: ‘output/best_model.pt’\n",
+      "\n",
+      "     0K ..                                                    100% 44.8M=0s\n",
+      "\n",
+      "2024-11-18 22:11:00 (44.8 MB/s) - ‘output/best_model.pt’ saved [2425/2425]\n",
+      "\n",
+      "--2024-11-18 22:11:00--  https://drive.google.com/file/d/1Qzb6Y9UB342a2QxmY-BCubSvcmYZ5jw3/view?usp=drive_link\n",
+      "Resolving drive.google.com (drive.google.com)... 173.194.79.139, 173.194.79.138, 173.194.79.102, ...\n",
+      "Connecting to drive.google.com (drive.google.com)|173.194.79.139|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: unspecified [text/html]\n",
+      "Saving to: ‘output/vocab.json’\n",
+      "\n",
+      "     0K .......... .......... .......... .......... .......... 2.51M\n",
+      "    50K .......... .......... .......... .......... .          5.45M=0.03s\n",
+      "\n",
+      "2024-11-18 22:11:01 (3.32 MB/s) - ‘output/vocab.json’ saved [93749]\n",
+      "\n",
+      "--2024-11-18 22:11:01--  https://drive.google.com/file/d/1VwPGHuSorVAXyTreMFI1yzMougtUDeUt/view?usp=drive_link\n",
+      "Resolving drive.google.com (drive.google.com)... 173.194.79.138, 173.194.79.102, 173.194.79.100, ...\n",
+      "Connecting to drive.google.com (drive.google.com)|173.194.79.138|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: unspecified [text/html]\n",
+      "Saving to: ‘output/args.json’\n",
+      "\n",
+      "     0K .......... .......... .......... .......... .......... 2.49M\n",
+      "    50K .......... .......... .......... .......... .          5.16M=0.03s\n",
+      "\n",
+      "2024-11-18 22:11:01 (3.25 MB/s) - ‘output/args.json’ saved [93798]\n",
+      "\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "CompletedProcess(args=\"wget --no-check-certificate 'https://drive.google.com/file/d/1VwPGHuSorVAXyTreMFI1yzMougtUDeUt/view?usp=drive_link' -O output/args.json\", returncode=0)"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "par = {'temp_dir': 'output'}\n",
+    "\n",
+    "par['model_file'] = f\"{par['temp_dir']}/best_model.pt\"\n",
+    "par['model_config_file'] = f\"{par['temp_dir']}/args.json\"\n",
+    "par['vocab_file'] = f\"{par['temp_dir']}/vocab.json\"\n",
+    "\n",
+    "\n",
+    "command = f\"wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1CPVtpWUJ2nkI9jGignlHLcefBe6Gk-F9' -O {par['model_file']}\"\n",
+    "subprocess.run(command, shell=True, check=True)\n",
+    "\n",
+    "command = f\"wget --no-check-certificate 'https://drive.google.com/file/d/1Qzb6Y9UB342a2QxmY-BCubSvcmYZ5jw3/view?usp=drive_link' -O {par['vocab_file']}\"\n",
+    "subprocess.run(command, shell=True, check=True)\n",
+    "\n",
+    "command = f\"wget --no-check-certificate 'https://drive.google.com/file/d/1VwPGHuSorVAXyTreMFI1yzMougtUDeUt/view?usp=drive_link' -O {par['model_config_file']}\"\n",
+    "subprocess.run(command, shell=True, check=True)\n"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 19,

diff --git a/...hods/single_omics/scgpt/config.novsh.yaml → ...ethods/single_omics/scgpt/config.vsh.yaml b/...hods/single_omics/scgpt/config.novsh.yaml → ...ethods/single_omics/scgpt/config.vsh.yaml
@@ -10,21 +10,6 @@ functionality:
       GRN inference using scGPT.
     documentation_url: https://github.com/bowang-lab/scGPT/blob/main/tutorials/Tutorial_Attention_GRN.ipynb 
   arguments:
-    - name: --model_file
-      type: file
-      direction: input
-      example: resources_test/supplementary/finetuned_scGPT_adamson/best_model.pt
-      default: resources_test/supplementary/finetuned_scGPT_adamson/best_model.pt
-    - name: --model_config_file
-      type: file
-      direction: input
-      example: resources_test/supplementary/finetuned_scGPT_adamson/args.json
-      default: resources_test/supplementary/finetuned_scGPT_adamson/args.json
-    - name: --vocab_file
-      type: file
-      direction: input
-      example: resources_test/supplementary/finetuned_scGPT_adamson/vocab.json
-      default: resources_test/supplementary/finetuned_scGPT_adamson/vocab.json
     - name: --n_bins
       type: integer
       direction: input

diff --git a/src/methods/single_omics/scgpt/script.py b/src/methods/single_omics/scgpt/script.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 import sys
 import warnings
+import subprocess
 
 import torch
 from anndata import AnnData
@@ -57,6 +58,22 @@
 }
 ## VIASH END
 
+# Download the pretrained scGPT checkpoint, its config, and its vocabulary into temp_dir.
+par['model_file'] = f"{par['temp_dir']}/best_model.pt"
+par['model_config_file'] = f"{par['temp_dir']}/args.json"
+par['vocab_file'] = f"{par['temp_dir']}/vocab.json"
+Path(par['temp_dir']).mkdir(parents=True, exist_ok=True)  # wget -O fails if the directory is missing
+# Drive '/file/d/<id>/view' links (and 'uc' for large files) return an HTML page with exit code 0; use the direct endpoint.
+for file_id, target in [
+    ('1CPVtpWUJ2nkI9jGignlHLcefBe6Gk-F9', par['model_file']),
+    ('1Qzb6Y9UB342a2QxmY-BCubSvcmYZ5jw3', par['vocab_file']),
+    ('1VwPGHuSorVAXyTreMFI1yzMougtUDeUt', par['model_config_file']),
+]:
+    url = f"https://drive.usercontent.google.com/download?id={file_id}&export=download&confirm=t"
+    subprocess.run(['wget', url, '-O', target], check=True)  # argv list: no shell quoting issues; TLS verification left on
+
+
+
 # os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:50"
 initial_memory = torch.cuda.memory_allocated()
 def monitor_memory():

diff --git a/src/metrics/script_all.py b/src/metrics/script_all.py
@@ -16,7 +16,7 @@ def define_par(dataset):
     raise ValueError('define first')
 
   par = {
-      'reg_type': 'ridge',
+      'reg_type': 'GB',
       'models_dir': f"resources/grn_models/{dataset}",
       'scores_dir': f"output/temp/{dataset}",
 
@@ -77,7 +77,7 @@ def define_par(dataset):
 global_models = False
 
 # - run metrics 
-for dataset in ['norman', 'adamson']: #'replogle2', 'nakatake', norman
+for dataset in ['op','replogle2', 'nakatake', 'norman', 'adamson']: #'replogle2', 'nakatake', norman
   print('------ ', dataset, '------')
   par = define_par(dataset)
   os.makedirs(par['scores_dir'], exist_ok=True)

diff --git a/...tch_correction_evaluation/config.vsh.yaml → ...h_correction_evaluation/config.novsh.yaml b/...tch_correction_evaluation/config.vsh.yaml → ...h_correction_evaluation/config.novsh.yaml
diff --git a/...on/batch_correction_scgen/config.vsh.yaml → .../batch_correction_scgen/config.novsh.yaml b/...on/batch_correction_scgen/config.vsh.yaml → .../batch_correction_scgen/config.novsh.yaml
diff --git a/...n/batch_correction_seurat/config.vsh.yaml → ...batch_correction_seurat/config.novsh.yaml b/...n/batch_correction_seurat/config.vsh.yaml → ...batch_correction_seurat/config.novsh.yaml
diff --git a/src/workflows/process_perturbation/config.vsh.yaml b/src/workflows/process_perturbation/config.vsh.yaml
@@ -12,10 +12,10 @@ functionality:
       type: file 
       required: true
       direction: input
-      default: resources/datasets_raw/perturbation_counts.h5ad
+      example: resources_test/datasets_raw/op_perturbation_counts.h5ad
       description: single cell perturbation data 
 
-    - name: --perturbation_data_bc
+    - name: --perturbation_data_n
       __merge__: ../../api/file_evaluation_h5ad.yaml
       required: false
       direction: output
@@ -28,8 +28,8 @@ functionality:
   dependencies:
     - name: perturbation/sc_counts 
     - name: perturbation/normalization
-    - name: perturbation/batch_correction_scgen
-    - name: perturbation/batch_correction_seurat 
+    # - name: perturbation/batch_correction_scgen
+    # - name: perturbation/batch_correction_seurat 
 
 platforms:
   - type: nextflow

diff --git a/src/workflows/process_perturbation/main.nf b/src/workflows/process_perturbation/main.nf
@@ -14,18 +14,9 @@ workflow run_wf {
       fromState: [pseudobulked_data_f: "pseudobulked_data_f"],
       toState: [perturbation_data_n: "perturbation_data_n"]
     )
-
-    | batch_correction_scgen.run(
-      fromState: [perturbation_data_n: "perturbation_data_n"],
-      toState: [perturbation_data_bc: "perturbation_data_bc"]
-    )
 
-    | batch_correction_seurat.run(
-      fromState: [perturbation_data_n: "perturbation_data_bc"],
-      toState: [perturbation_data_bc: "perturbation_data_bc"]
-    )
 
-    | setState(["perturbation_data_bc"])
+    | setState(["perturbation_data_n"])
 
   emit:
   output_ch

diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml
@@ -9,13 +9,13 @@ functionality:
   argument_groups:
     - name: Inputs
       arguments:
-        - name: --multiomics_rna
+        - name: --rna
           type: file
           direction: input
-        - name: --multiomics_atac
+        - name: --atac
           type: file
           direction: input
-        - name: --perturbation_data
+        - name: --evaluation_data
           type: file
           direction: input
         - name: --prediction
@@ -25,7 +25,7 @@ functionality:
         - name: --subsample
           type: integer
           direction: input
-          default: 2
+          default: -1
         - name: --reg_type
           type: string
           direction: input
@@ -49,15 +49,7 @@ functionality:
           required: false
           direction: input
           default: pearson
-        - name: --cell_type_specific
-          type: boolean
-          required: false
-          direction: input
-          default: true
-        - name: --normalize
-          type: boolean
-          required: false
-          direction: input        
+
     - name: Outputs
       arguments:
         - name: "--scores"
@@ -96,10 +88,11 @@ functionality:
     - name: grn_methods/portia
     - name: grn_methods/grnboost2
     - name: grn_methods/scenic
+
     # - name: grn_methods/genie3
 
     - name: grn_methods/ppcor #needs docker image
-    # - name: grn_methods/scgpt 
+    - name: grn_methods/scgpt 
 
     # ---- multiomics 
     - name: grn_methods/celloracle

diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf
@@ -18,6 +18,7 @@ workflow run_wf {
     grnboost2,
     ppcor,
     scenic,
+    scglue,
 
     pearson_corr,
     negative_control,
@@ -77,12 +78,9 @@ workflow run_wf {
       },
       // use 'fromState' to fetch the arguments the component requires from the overall state
       fromState: [
-        multiomics_rna: "multiomics_rna",
-        multiomics_atac: "multiomics_atac",
+        rna: "rna",
+        atac: "atac",
         tf_all: "tf_all",
-        perturbation_data:"perturbation_data",
-        cell_type_specific:"cell_type_specific",
-        normalize:"normalize",
         num_workers:"num_workers"
 
       ],
@@ -107,7 +105,7 @@ workflow run_wf {
       },
       // use 'fromState' to fetch the arguments the component requires from the overall state
       fromState: [
-        perturbation_data: "perturbation_data",
+        evaluation_data: "evaluation_data",
         prediction: "prediction",
         method_id: "method_id", 
         subsample: "subsample",
@@ -116,7 +114,6 @@ workflow run_wf {
         consensus: "consensus",
         tf_all: "tf_all",
         layer:"layer",
-        cell_type_specific:"cell_type_specific"
       ],
       // use 'toState' to publish that component's outputs to the overall state
       toState: { id, output, state, comp ->