
Commit

scenic+ was modified to prevent the biomart issue.
janursa committed Oct 11, 2024
1 parent 0b42bb9 commit 68ff6c4
Showing 12 changed files with 643 additions and 75 deletions.
533 changes: 523 additions & 10 deletions runs.ipynb

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions scripts/sbatch/figr.sh → scripts/sbatch/r_script.sh
@@ -1,5 +1,5 @@
#!/bin/bash
#SBATCH --job-name=figr
#SBATCH --job-name=r_model
#SBATCH --time=48:00:00
#SBATCH --output=logs/%j.out
#SBATCH --error=logs/%j.err
@@ -9,4 +9,5 @@
#SBATCH --mem=250G


singularity run ../../images/figr Rscript src/methods/multi_omics/figr/script.R
# singularity run ../../images/figr Rscript src/methods/multi_omics/figr/script.R
singularity run ../../images/ppcor Rscript src/methods/single_omics/ppcor/script.R
1 change: 1 addition & 0 deletions src/control_methods/pearson/script.py
@@ -54,6 +54,7 @@
from util import create_corr_net

par['causal'] = True
par['normalize'] = True
net = create_corr_net(par)
print('Output GRN')
net.to_csv(par['prediction'])
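
Note on the change above: the pearson control now sets par['normalize'] = True alongside par['causal'] = True before calling create_corr_net. That helper lives in util.py and is not part of this commit, so the following is only a hedged sketch of what a causal, normalized correlation builder could look like. The name create_corr_net_sketch, the scanpy normalization step, and reading tf_all.csv as one TF per line (matching how the R scripts scan it) are assumptions, not the repository's implementation.

import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc

def create_corr_net_sketch(par):
    adata = ad.read_h5ad(par['multiomics_rna'])
    if par.get('normalize', False):
        # assumed normalization: library-size scaling + log1p before computing correlations
        sc.pp.normalize_total(adata)
        sc.pp.log1p(adata)
    X = adata.X.toarray() if hasattr(adata.X, 'toarray') else np.asarray(adata.X)
    genes = adata.var_names

    # gene-gene Pearson correlation matrix, flattened into an edge list
    corr = pd.DataFrame(np.corrcoef(X, rowvar=False), index=genes, columns=genes)
    net = corr.stack().rename('weight').rename_axis(['source', 'target']).reset_index()
    net = net[net['source'] != net['target']]

    if par.get('causal', False):
        # 'causal' is read here as: keep only edges whose source is a known TF
        tf_all = pd.read_csv(par['tf_all'], header=None)[0]
        net = net[net['source'].isin(tf_all)]

    # keep the strongest links by absolute weight
    net = net.loc[net['weight'].abs().sort_values(ascending=False).index]
    return net.head(par.get('max_n_links', 50000))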
2 changes: 2 additions & 0 deletions src/control_methods/positive_control/script.py
@@ -24,6 +24,8 @@

print('Create causal corr net')
par['causal'] = True
par['normalize'] = True

par['multiomics_rna'] = par['perturbation_data']
net = create_corr_net(par)

6 changes: 3 additions & 3 deletions src/exp_analysis/helper.py
@@ -59,7 +59,7 @@ def calculate_feature_distance(sample: pd.DataFrame, control: pd.DataFrame, top_

distance_df = pd.DataFrame({'distance_raw':distance_raw, 'distance_rank':distance_rank}, index=entries_common)
return distance_df
def cosine_similarity(nets_dict, col_name='source', weight_col='weight', figsize=(4, 4), title='Cosine Similarity', ax=None):
def cosine_similarity_net(nets_dict, col_name='source', weight_col='weight', figsize=(4, 4), title='Cosine Similarity', ax=None):
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
@@ -99,14 +99,14 @@ def cosine_similarity(nets_dict, col_name='source', weight_col='weight', figsize

sns.heatmap(cosine_sim_matrix, annot=True, cmap="coolwarm", xticklabels=nets_names, yticklabels=nets_names, ax=ax)
ax.grid(False)
ax.set_title(title)
ax.set_title(title, pad=20)

# Rotate x labels for readability
plt.xticks(rotation=45, ha='right')

return cosine_sim_matrix, fig

def jaccard_similarity(nets_dict, col_name='link', figsize=(4, 4), title='jaccard Similarity', ax=None):
def jaccard_similarity_net(nets_dict, col_name='link', figsize=(4, 4), title='jaccard Similarity', ax=None):
from itertools import combinations
import seaborn as sns
import matplotlib.pyplot as plt
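
Renaming cosine_similarity to cosine_similarity_net (and jaccard_similarity to jaccard_similarity_net) also avoids shadowing the sklearn cosine_similarity imported inside the function. For reference, a minimal sketch of the pairwise comparison such a helper performs, assuming each network is an edge list with 'source', 'target' and a weight column; aligning on the union of edges and treating absent edges as zero weight is an assumption, not necessarily what helper.py does.

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def pairwise_net_cosine(nets_dict, weight_col='weight'):
    # one weight vector per network, indexed by (source, target) edge
    vectors = {name: net.set_index(['source', 'target'])[weight_col]
               for name, net in nets_dict.items()}
    # align on the union of edges; an edge missing from a network contributes 0
    aligned = pd.concat(vectors, axis=1).fillna(0)
    sim = cosine_similarity(aligned.T.values)
    return pd.DataFrame(sim, index=aligned.columns, columns=aligned.columns)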
34 changes: 29 additions & 5 deletions src/utils/helper.py → src/helper.py
@@ -4,7 +4,6 @@
import anndata as ad
import numpy as np
import scanpy as sc
from src.exp_analysis.helper import plot_cumulative_density
import sys
import subprocess
import io
@@ -41,6 +40,15 @@ def check_scores(par):
df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))
df_scores['rank'] = df_all_n.mean(axis=1).rank(ascending=False).astype(int)
return df_scores

def calculate_scores():
command = [
"sbatch",
"scripts/sbatch/calculate_scores.sh"
]

# Print command to verify
subprocess.run(command)


def run_consensus(par):
@@ -55,7 +63,7 @@ def run_consensus(par):
# Print command to verify
subprocess.run(command)

def run_grn_inference_seqera():
def run_grn_seqera():
# if False: # submit the job
# !bash src/methods/multi_omics/celloracle/run.sh
# if False: # get the results
@@ -74,13 +82,22 @@ def run_grn_inference_seqera():
raise ValueError('define first')

def run_grn_inference():
# par = {
# 'methods': ['scglue'],
# 'models_dir': 'resources/grn_models/',
# 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna.h5ad',
# 'multiomics_atac': 'resources/grn-benchmark/multiomics_atac.h5ad',
# 'num_workers': 20,
# 'mem': "120GB",
# 'time': "48:00:00"
# }
par = {
'methods': ['scglue'],
'methods': ['scenicplus'],
'models_dir': 'resources/grn_models/',
'multiomics_rna': 'resources/grn-benchmark/multiomics_rna.h5ad',
'multiomics_atac': 'resources/grn-benchmark/multiomics_atac.h5ad',
'num_workers': 20,
'mem': "120GB",
'mem': "250GB",
'time': "48:00:00"
}

@@ -126,6 +143,8 @@ def run_grn_inference():
# Run sbatch command
try:
result = subprocess.run(['sbatch'] + full_tag + ['scripts/sbatch/grn_inference.sh', command], check=True, capture_output=True, text=True)
# result = subprocess.run(['bash'] + ['scripts/sbatch/grn_inference.sh', command], check=True, capture_output=True, text=True)

print(f"Job {method} submitted successfully.")
print(result.stdout) # Print the standard output
except subprocess.CalledProcessError as e:
@@ -297,4 +316,9 @@ def elapsed_to_hours(elapsed_str):
# Convert MaxRSS and MaxVMSize from KB to GB
df_local['MaxRSS'] = df_local['MaxRSS'] / (1024 ** 2) # Convert KB to GB
df_local['MaxVMSize'] = df_local['MaxVMSize'] / (1024 ** 2) # Convert KB to GB
return df_local
return df_local


if __name__ == '__main__':
# run_grn_inference()
calculate_scores()
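
For context on the ranking used earlier in this file's diff: check_scores min-max scales every metric column and then ranks methods by the mean scaled score. A small worked example of that step; the method names and values below are made up for illustration only.

import pandas as pd

# toy scores: one row per method, one column per metric (values are illustrative)
df_scores = pd.DataFrame(
    {'reg1-r2': [0.10, 0.30, 0.20], 'reg2-r2': [0.05, 0.25, 0.15]},
    index=['pearson', 'scenicplus', 'scglue'],
)
# min-max scale each metric column, then rank methods by the mean scaled score
scaled = (df_scores - df_scores.min(axis=0)) / (df_scores.max(axis=0) - df_scores.min(axis=0))
df_scores['rank'] = scaled.mean(axis=1).rank(ascending=False).astype(int)
# -> scenicplus ranks 1, scglue 2, pearson 3 in this toy example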
36 changes: 27 additions & 9 deletions src/methods/multi_omics/scenicplus/main.py
@@ -703,17 +703,20 @@ def preprocess_rna(par):
def snakemake_pipeline(par):
import os
snakemake_dir = os.path.join(par['temp_dir'], 'scplus_pipeline', 'Snakemake')
if os.path.exists(snakemake_dir):
import shutil
shutil.rmtree(snakemake_dir) # Replace 'snakemake_dir' with your directory path
# if os.path.exists(snakemake_dir):
# import shutil
# shutil.rmtree(snakemake_dir)

os.makedirs(os.path.join(par['temp_dir'], 'scplus_pipeline'), exist_ok=True)
os.makedirs(os.path.join(par['temp_dir'], 'scplus_pipeline', 'temp'), exist_ok=True)
subprocess.run(['scenicplus', 'init_snakemake', '--out_dir', os.path.join(par['temp_dir'], 'scplus_pipeline')])
print('snake make initialized', flush=True)

pipeline_dir = os.path.join(par['temp_dir'], 'scplus_pipeline')
if not os.path.exists(pipeline_dir):
subprocess.run(['scenicplus', 'init_snakemake', '--out_dir', pipeline_dir])
print('snake make initialized', flush=True)

# Load pipeline settings
with open(os.path.join(par['temp_dir'], 'scplus_pipeline', 'Snakemake', 'config', 'config.yaml'), 'r') as f:
with open(os.path.join(snakemake_dir, 'config', 'config.yaml'), 'r') as f:
settings = yaml.safe_load(f)
print('output_data:', settings['output_data'], flush=True)

@@ -776,13 +779,28 @@ def snakemake_pipeline(par):

# Run pipeline
print('run snakemake ', flush=True)

with contextlib.chdir(snakemake_dir):
# this fails to download automatically so we do it manually
if True:
url = "https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.chrom.sizes"
response = requests.get(url)
with open('hg38.chrom.sizes', 'wb') as file:
file.write(response.content)

subprocess.run([
f'snakemake',
'--cores', str(par['num_workers']),
#'--unlock'
'touch'
])

subprocess.run([
f'snakemake',
'--unlock'
])

subprocess.run([
f'snakemake',
'--cores', str(par['num_workers'])])

def post_process(par):
scplus_mdata = mudata.read(par['scplus_mdata'])
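
Two notes on the snakemake_pipeline changes above: contextlib.chdir is only available from Python 3.11 onwards, and the first snakemake invocation passes the bare word 'touch' as a positional argument, which Snakemake interprets as a target name; the flag for marking existing outputs as up to date is --touch. Below is a hedged sketch of the same touch / unlock / run sequence with explicit flags, assuming that is the intent; using check=False on the first two calls (which can legitimately fail on a fresh run) is an assumption, not part of this commit.

import contextlib   # contextlib.chdir requires Python >= 3.11
import subprocess

def run_snakemake(snakemake_dir: str, cores: int) -> None:
    with contextlib.chdir(snakemake_dir):
        # mark outputs that already exist as up to date, so finished stages are not rerun
        subprocess.run(['snakemake', '--touch', '--cores', str(cores)], check=False)
        # clear a stale lock left behind by an interrupted run
        subprocess.run(['snakemake', '--unlock'], check=False)
        # the actual run
        subprocess.run(['snakemake', '--cores', str(cores)], check=True)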
32 changes: 16 additions & 16 deletions src/methods/multi_omics/scenicplus/script.py
@@ -69,22 +69,22 @@ def main(par):
par['MALLET_PATH'] = os.path.join(par['temp_dir'], 'Mallet-202108', 'bin', 'mallet')
os.makedirs(par['atac_dir'], exist_ok=True)

print('------- download_databases -------')
download_databases(par)
print_memory_usage()
print('------- process_peak -------')
process_peak(par)
print_memory_usage()
print('------- run_cistopic -------')
run_cistopic(par)
print_memory_usage()
print('------- process_topics -------')
process_topics(par)
print_memory_usage()
print('------- preprocess_rna -------')
preprocess_rna(par)
print_memory_usage()
print('------- snakemake_pipeline -------')
# print('------- download_databases -------')
# download_databases(par)
# print_memory_usage()
# print('------- process_peak -------')
# process_peak(par)
# print_memory_usage()
# print('------- run_cistopic -------')
# run_cistopic(par)
# print_memory_usage()
# print('------- process_topics -------')
# process_topics(par)
# print_memory_usage()
# print('------- preprocess_rna -------')
# preprocess_rna(par)
# print_memory_usage()
# print('------- snakemake_pipeline -------')
snakemake_pipeline(par)
print_memory_usage()
print('------- post_process -------')
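
In script.py the earlier SCENIC+ stages are commented out so only the Snakemake step onwards reruns, presumably because their outputs already sit in temp_dir. A hypothetical alternative, not used in this commit, is to gate each stage behind an "output already exists" check so interrupted runs resume without editing the script; the marker path in the usage line is made up.

import os

def run_stage(name, func, par, done_marker):
    # skip a stage whose expected output is already on disk
    if os.path.exists(done_marker):
        print(f'------- {name}: found {done_marker}, skipping -------', flush=True)
        return
    print(f'------- {name} -------', flush=True)
    func(par)

# e.g. (hypothetical marker file):
# run_stage('run_cistopic', run_cistopic, par, os.path.join(par['temp_dir'], 'cistopic_obj.pkl'))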
4 changes: 2 additions & 2 deletions src/methods/multi_omics/scglue/main.py
@@ -264,12 +264,12 @@ def prune_grn(par):
"--output", f"{par['temp_dir']}/pruned_grn.csv",
"--top_n_targets", str(par['top_n_targets']),
"--rank_threshold", str(par['rank_threshold']),
"--auc_threshold", "0",
"--auc_threshold", ".5",
"--nes_threshold", "0",
"--min_genes", "1",
"--thresholds", "0.5", "0.7",
"--top_n_regulators", "10", "50", "100",
"--max_similarity_fdr", "0.1",
"--max_similarity_fdr", "0.2",
"--num_workers", f"{par['num_workers']}",
"--cell_id_attribute", "obs_id", # be sure that obs_id is in obs and name is in var
"--gene_attribute", "name"
9 changes: 7 additions & 2 deletions src/methods/single_omics/ppcor/script.R
@@ -4,14 +4,16 @@ library(dplyr)

## VIASH START
par <- list(
multiomics_rna = 'resources/grn-benchmark/multiomics_rna.h5ad',
multiomics_rna = 'resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad',
prediction = 'resources/grn_models/ppcor.csv',
tf_all = 'resources/prior/tf_all.csv',
max_n_links = 50000
)
## VIASH END
print(par)
print(dim(par))
# input expression data
tf_names <- scan(par$tf_all, what = "", sep = "\n")

ad <- anndata::read_h5ad(par$multiomics_rna)

inputExpr <- ad$X
@@ -34,6 +36,9 @@ df <- df[order(df$weight, decreasing=TRUE),]
# Remove self-interactions
df_filtered <- df[df$source != df$target,]

# Filter to keep only connections where the source is a TF
df_filtered <- df_filtered %>% filter(source %in% tf_names)

# Reset index
df_filtered$index = 1:nrow(df_filtered)

2 changes: 1 addition & 1 deletion src/metrics/regression_2/main.py
@@ -252,7 +252,7 @@ def static_approach(
if n_features[n_features_dict[gene_name]] != 0:
score = res['scores'][i]['avg-r2']
if clip_scores:
score = np.clip(score, 0, 1)
score = np.clip(score, 0, 1)
r2.append(score)
if len(r2)==0:
raise ValueError('R2 score is empty')
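
The re-indented np.clip line means clipping now only runs when clip_scores is set. The point of clipping is that R² is unbounded below, so a single badly fit gene can drag the aggregate negative; a tiny illustration with made-up numbers:

import numpy as np

scores = np.array([0.4, 0.2, -3.0])   # third gene fits much worse than a mean predictor
print(scores.mean())                  # -0.8: one bad gene dominates the average
print(np.clip(scores, 0, 1).mean())   # 0.2: clipping caps its influence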
54 changes: 29 additions & 25 deletions src/metrics/script_all.py
@@ -60,29 +60,33 @@

os.makedirs(par['scores_dir'], exist_ok=True)

for dataset_type in ['full', 'hvg']:
for apply_skeleton in [False, True]:
os.makedirs(f"{par['scores_dir']}/{dataset_type}/skeleton_{apply_skeleton}", exist_ok=True)
for layer in par['layers']:
par['layer'] = layer
i = 0
for method in par['methods']:
print(method)
par['prediction'] = f"{par['models_dir']}/{dataset_type}/{method}.csv"
if not os.path.exists(par['prediction']):
print(f"{par['prediction']} doesnt exist. Skipped.")
continue
from regression_1.main import main
reg1 = main(par)
from regression_2.main import main
reg2 = main(par)
score = pd.concat([reg1, reg2], axis=1)
score.index = [method]
if i==0:
df_all = score
else:
df_all = pd.concat([df_all, score])
df_all.to_csv(f"{par['scores_dir']}/{dataset_type}/skeleton_{apply_skeleton}/{layer}-{par['reg_type']}.csv")
print(df_all)
i+=1
for max_n_links in [10000, 50000]:
par['max_n_links'] = max_n_links
for dataset_type in ['full', 'hvg']:
for apply_skeleton in [False, True]:
par['apply_skeleton'] = apply_skeleton
save_dir = f"{par['scores_dir']}/{dataset_type}/{max_n_links}/skeleton_{apply_skeleton}"
os.makedirs(save_dir, exist_ok=True)
for layer in par['layers']:
par['layer'] = layer
i = 0
for method in par['methods']:
print(method)
par['prediction'] = f"{par['models_dir']}/{dataset_type}/{method}.csv"
if not os.path.exists(par['prediction']):
print(f"{par['prediction']} doesnt exist. Skipped.")
continue
from regression_1.main import main
reg1 = main(par)
from regression_2.main import main
reg2 = main(par)
score = pd.concat([reg1, reg2], axis=1)
score.index = [method]
if i==0:
df_all = score
else:
df_all = pd.concat([df_all, score])
df_all.to_csv(f"{save_dir}/{layer}-{par['reg_type']}.csv")
print(df_all)
i+=1
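
The rewritten loop now also sweeps max_n_links and records apply_skeleton in par, writing one CSV per (dataset, max_n_links, skeleton, layer) combination under scores_dir. A sketch for gathering those files back into a single tidy table; the directory layout is read off the save path above and should be treated as an assumption if that path changes.

import glob
import os
import pandas as pd

def collect_scores(scores_dir: str) -> pd.DataFrame:
    frames = []
    for path in glob.glob(os.path.join(scores_dir, '*', '*', 'skeleton_*', '*.csv')):
        dataset_type, max_n_links, skeleton, fname = path.split(os.sep)[-4:]
        layer, reg_type = os.path.splitext(fname)[0].rsplit('-', 1)
        df = pd.read_csv(path, index_col=0)   # index holds the method names
        df = df.assign(dataset_type=dataset_type, max_n_links=int(max_n_links),
                       apply_skeleton=(skeleton == 'skeleton_True'),
                       layer=layer, reg_type=reg_type)
        frames.append(df)
    return pd.concat(frames) if frames else pd.DataFrame()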
