
Commit

scenic+ was modified to prevent the biomart issue.
janursa committed Oct 11, 2024
1 parent 0b42bb9 commit 68ff6c4
Showing 12 changed files with 643 additions and 75 deletions.
533 changes: 523 additions & 10 deletions runs.ipynb

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions scripts/sbatch/figr.sh → scripts/sbatch/r_script.sh
@@ -1,5 +1,5 @@
#!/bin/bash
#SBATCH --job-name=figr
#SBATCH --job-name=r_model
#SBATCH --time=48:00:00
#SBATCH --output=logs/%j.out
#SBATCH --error=logs/%j.err
@@ -9,4 +9,5 @@
#SBATCH --mem=250G


singularity run ../../images/figr Rscript src/methods/multi_omics/figr/script.R
# singularity run ../../images/figr Rscript src/methods/multi_omics/figr/script.R
singularity run ../../images/ppcor Rscript src/methods/single_omics/ppcor/script.R
1 change: 1 addition & 0 deletions src/control_methods/pearson/script.py
@@ -54,6 +54,7 @@
from util import create_corr_net

par['causal'] = True
par['normalize'] = True
net = create_corr_net(par)
print('Output GRN')
net.to_csv(par['prediction'])
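
Note on the change above: the pearson control now sets par['normalize'] = True alongside par['causal'] = True before calling create_corr_net. That helper lives in util.py and is not part of this commit, so the following is only a hedged sketch of what a causal, normalized correlation builder could look like. The name create_corr_net_sketch, the scanpy normalization step, and reading tf_all.csv as one TF per line (matching how the R scripts scan it) are assumptions, not the repository's implementation.

import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc

def create_corr_net_sketch(par):
    adata = ad.read_h5ad(par['multiomics_rna'])
    if par.get('normalize', False):
        # assumed normalization: library-size scaling + log1p before computing correlations
        sc.pp.normalize_total(adata)
        sc.pp.log1p(adata)
    X = adata.X.toarray() if hasattr(adata.X, 'toarray') else np.asarray(adata.X)
    genes = adata.var_names

    # gene-gene Pearson correlation matrix, flattened into an edge list
    corr = pd.DataFrame(np.corrcoef(X, rowvar=False), index=genes, columns=genes)
    net = corr.stack().rename('weight').rename_axis(['source', 'target']).reset_index()
    net = net[net['source'] != net['target']]

    if par.get('causal', False):
        # 'causal' is read here as: keep only edges whose source is a known TF
        tf_all = pd.read_csv(par['tf_all'], header=None)[0]
        net = net[net['source'].isin(tf_all)]

    # keep the strongest links by absolute weight
    net = net.loc[net['weight'].abs().sort_values(ascending=False).index]
    return net.head(par.get('max_n_links', 50000))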
2 changes: 2 additions & 0 deletions src/control_methods/positive_control/script.py
@@ -24,6 +24,8 @@

print('Create causal corr net')
par['causal'] = True
par['normalize'] = True

par['multiomics_rna'] = par['perturbation_data']
net = create_corr_net(par)

6 changes: 3 additions & 3 deletions src/exp_analysis/helper.py
@@ -59,7 +59,7 @@ def calculate_feature_distance(sample: pd.DataFrame, control: pd.DataFrame, top_

distance_df = pd.DataFrame({'distance_raw':distance_raw, 'distance_rank':distance_rank}, index=entries_common)
return distance_df
def cosine_similarity(nets_dict, col_name='source', weight_col='weight', figsize=(4, 4), title='Cosine Similarity', ax=None):
def cosine_similarity_net(nets_dict, col_name='source', weight_col='weight', figsize=(4, 4), title='Cosine Similarity', ax=None):
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
@@ -99,14 +99,14 @@ def cosine_similarity(nets_dict, col_name='source', weight_col='weight', figsize

sns.heatmap(cosine_sim_matrix, annot=True, cmap="coolwarm", xticklabels=nets_names, yticklabels=nets_names, ax=ax)
ax.grid(False)
ax.set_title(title)
ax.set_title(title, pad=20)

# Rotate x labels for readability
plt.xticks(rotation=45, ha='right')

return cosine_sim_matrix, fig

def jaccard_similarity(nets_dict, col_name='link', figsize=(4, 4), title='jaccard Similarity', ax=None):
def jaccard_similarity_net(nets_dict, col_name='link', figsize=(4, 4), title='jaccard Similarity', ax=None):
from itertools import combinations
import seaborn as sns
import matplotlib.pyplot as plt
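
Renaming cosine_similarity to cosine_similarity_net (and jaccard_similarity to jaccard_similarity_net) also avoids shadowing the sklearn cosine_similarity imported inside the function. For reference, a minimal sketch of the pairwise comparison such a helper performs, assuming each network is an edge list with 'source', 'target' and a weight column; aligning on the union of edges and treating absent edges as zero weight is an assumption, not necessarily what helper.py does.

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def pairwise_net_cosine(nets_dict, weight_col='weight'):
    # one weight vector per network, indexed by (source, target) edge
    vectors = {name: net.set_index(['source', 'target'])[weight_col]
               for name, net in nets_dict.items()}
    # align on the union of edges; an edge missing from a network contributes 0
    aligned = pd.concat(vectors, axis=1).fillna(0)
    sim = cosine_similarity(aligned.T.values)
    return pd.DataFrame(sim, index=aligned.columns, columns=aligned.columns)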
34 changes: 29 additions & 5 deletions src/utils/helper.py → src/helper.py
@@ -4,7 +4,6 @@
import anndata as ad
import numpy as np
import scanpy as sc
from src.exp_analysis.helper import plot_cumulative_density
import sys
import subprocess
import io
@@ -41,6 +40,15 @@ def check_scores(par):
df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))
df_scores['rank'] = df_all_n.mean(axis=1).rank(ascending=False).astype(int)
return df_scores

def calculate_scores():
command = [
"sbatch",
"scripts/sbatch/calculate_scores.sh"
]

# Print command to verify
subprocess.run(command)


def run_consensus(par):
@@ -55,7 +63,7 @@ def run_consensus(par):
# Print command to verify
subprocess.run(command)

def run_grn_inference_seqera():
def run_grn_seqera():
# if False: # submit the job
# !bash src/methods/multi_omics/celloracle/run.sh
# if False: # get the results
@@ -74,13 +82,22 @@ def run_grn_inference_seqera():
raise ValueError('define first')

def run_grn_inference():
# par = {
# 'methods': ['scglue'],
# 'models_dir': 'resources/grn_models/',
# 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna.h5ad',
# 'multiomics_atac': 'resources/grn-benchmark/multiomics_atac.h5ad',
# 'num_workers': 20,
# 'mem': "120GB",
# 'time': "48:00:00"
# }
par = {
'methods': ['scglue'],
'methods': ['scenicplus'],
'models_dir': 'resources/grn_models/',
'multiomics_rna': 'resources/grn-benchmark/multiomics_rna.h5ad',
'multiomics_atac': 'resources/grn-benchmark/multiomics_atac.h5ad',
'num_workers': 20,
'mem': "120GB",
'mem': "250GB",
'time': "48:00:00"
}

@@ -126,6 +143,8 @@ def run_grn_inference():
# Run sbatch command
try:
result = subprocess.run(['sbatch'] + full_tag + ['scripts/sbatch/grn_inference.sh', command], check=True, capture_output=True, text=True)
# result = subprocess.run(['bash'] + ['scripts/sbatch/grn_inference.sh', command], check=True, capture_output=True, text=True)

print(f"Job {method} submitted successfully.")
print(result.stdout) # Print the standard output
except subprocess.CalledProcessError as e:
@@ -297,4 +316,9 @@ def elapsed_to_hours(elapsed_str):
# Convert MaxRSS and MaxVMSize from KB to GB
df_local['MaxRSS'] = df_local['MaxRSS'] / (1024 ** 2) # Convert KB to GB
df_local['MaxVMSize'] = df_local['MaxVMSize'] / (1024 ** 2) # Convert KB to GB
return df_local
return df_local


if __name__ == '__main__':
# run_grn_inference()
calculate_scores()
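
For context on the ranking used earlier in this file's diff: check_scores min-max scales every metric column and then ranks methods by the mean scaled score. A small worked example of that step; the method names and values below are made up for illustration only.

import pandas as pd

# toy scores: one row per method, one column per metric (values are illustrative)
df_scores = pd.DataFrame(
    {'reg1-r2': [0.10, 0.30, 0.20], 'reg2-r2': [0.05, 0.25, 0.15]},
    index=['pearson', 'scenicplus', 'scglue'],
)
# min-max scale each metric column, then rank methods by the mean scaled score
scaled = (df_scores - df_scores.min(axis=0)) / (df_scores.max(axis=0) - df_scores.min(axis=0))
df_scores['rank'] = scaled.mean(axis=1).rank(ascending=False).astype(int)
# -> scenicplus ranks 1, scglue 2, pearson 3 in this toy example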
36 changes: 27 additions & 9 deletions src/methods/multi_omics/scenicplus/main.py
@@ -703,17 +703,20 @@ def preprocess_rna(par):
def snakemake_pipeline(par):
import os
snakemake_dir = os.path.join(par['temp_dir'], 'scplus_pipeline', 'Snakemake')
if os.path.exists(snakemake_dir):
import shutil
shutil.rmtree(snakemake_dir) # Replace 'snakemake_dir' with your directory path
# if os.path.exists(snakemake_dir):
# import shutil
# shutil.rmtree(snakemake_dir)

os.makedirs(os.path.join(par['temp_dir'], 'scplus_pipeline'), exist_ok=True)
os.makedirs(os.path.join(par['temp_dir'], 'scplus_pipeline', 'temp'), exist_ok=True)
subprocess.run(['scenicplus', 'init_snakemake', '--out_dir', os.path.join(par['temp_dir'], 'scplus_pipeline')])
print('snake make initialized', flush=True)

pipeline_dir = os.path.join(par['temp_dir'], 'scplus_pipeline')
if not os.path.exists(pipeline_dir):
subprocess.run(['scenicplus', 'init_snakemake', '--out_dir', pipeline_dir])
print('snake make initialized', flush=True)

# Load pipeline settings
with open(os.path.join(par['temp_dir'], 'scplus_pipeline', 'Snakemake', 'config', 'config.yaml'), 'r') as f:
with open(os.path.join(snakemake_dir, 'config', 'config.yaml'), 'r') as f:
settings = yaml.safe_load(f)
print('output_data:', settings['output_data'], flush=True)

@@ -776,13 +779,28 @@ def snakemake_pipeline(par):

# Run pipeline
print('run snakemake ', flush=True)

with contextlib.chdir(snakemake_dir):
# this fails to download automatically so we do it manually
if True:
url = "https://hgdownload.cse.ucsc.edu/goldenpath/hg38/bigZips/hg38.chrom.sizes"
response = requests.get(url)
with open('hg38.chrom.sizes', 'wb') as file:
file.write(response.content)

subprocess.run([
f'snakemake',
'--cores', str(par['num_workers']),
#'--unlock'
'touch'
])

subprocess.run([
f'snakemake',
'--unlock'
])

subprocess.run([
f'snakemake',
'--cores', str(par['num_workers'])])

def post_process(par):
scplus_mdata = mudata.read(par['scplus_mdata'])
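
Two notes on the snakemake_pipeline changes above: contextlib.chdir is only available from Python 3.11 onwards, and the first snakemake invocation passes the bare word 'touch' as a positional argument, which Snakemake interprets as a target name; the flag for marking existing outputs as up to date is --touch. Below is a hedged sketch of the same touch / unlock / run sequence with explicit flags, assuming that is the intent; using check=False on the first two calls (which can legitimately fail on a fresh run) is an assumption, not part of this commit.

import contextlib   # contextlib.chdir requires Python >= 3.11
import subprocess

def run_snakemake(snakemake_dir: str, cores: int) -> None:
    with contextlib.chdir(snakemake_dir):
        # mark outputs that already exist as up to date, so finished stages are not rerun
        subprocess.run(['snakemake', '--touch', '--cores', str(cores)], check=False)
        # clear a stale lock left behind by an interrupted run
        subprocess.run(['snakemake', '--unlock'], check=False)
        # the actual run
        subprocess.run(['snakemake', '--cores', str(cores)], check=True)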
32 changes: 16 additions & 16 deletions src/methods/multi_omics/scenicplus/script.py
@@ -69,22 +69,22 @@ def main(par):
par['MALLET_PATH'] = os.path.join(par['temp_dir'], 'Mallet-202108', 'bin', 'mallet')
os.makedirs(par['atac_dir'], exist_ok=True)

print('------- download_databases -------')
download_databases(par)
print_memory_usage()
print('------- process_peak -------')
process_peak(par)
print_memory_usage()
print('------- run_cistopic -------')
run_cistopic(par)
print_memory_usage()
print('------- process_topics -------')
process_topics(par)
print_memory_usage()
print('------- preprocess_rna -------')
preprocess_rna(par)
print_memory_usage()
print('------- snakemake_pipeline -------')
# print('------- download_databases -------')
# download_databases(par)
# print_memory_usage()
# print('------- process_peak -------')
# process_peak(par)
# print_memory_usage()
# print('------- run_cistopic -------')
# run_cistopic(par)
# print_memory_usage()
# print('------- process_topics -------')
# process_topics(par)
# print_memory_usage()
# print('------- preprocess_rna -------')
# preprocess_rna(par)
# print_memory_usage()
# print('------- snakemake_pipeline -------')
snakemake_pipeline(par)
print_memory_usage()
print('------- post_process -------')
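
In script.py the earlier SCENIC+ stages are commented out so only the Snakemake step onwards reruns, presumably because their outputs already sit in temp_dir. A hypothetical alternative, not used in this commit, is to gate each stage behind an "output already exists" check so interrupted runs resume without editing the script; the marker path in the usage line is made up.

import os

def run_stage(name, func, par, done_marker):
    # skip a stage whose expected output is already on disk
    if os.path.exists(done_marker):
        print(f'------- {name}: found {done_marker}, skipping -------', flush=True)
        return
    print(f'------- {name} -------', flush=True)
    func(par)

# e.g. (hypothetical marker file):
# run_stage('run_cistopic', run_cistopic, par, os.path.join(par['temp_dir'], 'cistopic_obj.pkl'))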
4 changes: 2 additions & 2 deletions src/methods/multi_omics/scglue/main.py
@@ -264,12 +264,12 @@ def prune_grn(par):
"--output", f"{par['temp_dir']}/pruned_grn.csv",
"--top_n_targets", str(par['top_n_targets']),
"--rank_threshold", str(par['rank_threshold']),
"--auc_threshold", "0",
"--auc_threshold", ".5",
"--nes_threshold", "0",
"--min_genes", "1",
"--thresholds", "0.5", "0.7",
"--top_n_regulators", "10", "50", "100",
"--max_similarity_fdr", "0.1",
"--max_similarity_fdr", "0.2",
"--num_workers", f"{par['num_workers']}",
"--cell_id_attribute", "obs_id", # be sure that obs_id is in obs and name is in var
"--gene_attribute", "name"
9 changes: 7 additions & 2 deletions src/methods/single_omics/ppcor/script.R
@@ -4,14 +4,16 @@ library(dplyr)

## VIASH START
par <- list(
multiomics_rna = 'resources/grn-benchmark/multiomics_rna.h5ad',
multiomics_rna = 'resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad',
prediction = 'resources/grn_models/ppcor.csv',
tf_all = 'resources/prior/tf_all.csv',
max_n_links = 50000
)
## VIASH END
print(par)
print(dim(par))
# input expression data
tf_names <- scan(par$tf_all, what = "", sep = "\n")

ad <- anndata::read_h5ad(par$multiomics_rna)

inputExpr <- ad$X
@@ -34,6 +36,9 @@ df <- df[order(df$weight, decreasing=TRUE),]
# Remove self-interactions
df_filtered <- df[df$source != df$target,]

# Filter to keep only connections where the source is a TF
df_filtered <- df_filtered %>% filter(source %in% tf_names)

# Reset index
df_filtered$index = 1:nrow(df_filtered)

2 changes: 1 addition & 1 deletion src/metrics/regression_2/main.py
@@ -252,7 +252,7 @@ def static_approach(
if n_features[n_features_dict[gene_name]] != 0:
score = res['scores'][i]['avg-r2']
if clip_scores:
score = np.clip(score, 0, 1)
score = np.clip(score, 0, 1)
r2.append(score)
if len(r2)==0:
raise ValueError('R2 score is empty')
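
The re-indented np.clip line means clipping now only runs when clip_scores is set. The point of clipping is that R² is unbounded below, so a single badly fit gene can drag the aggregate negative; a tiny illustration with made-up numbers:

import numpy as np

scores = np.array([0.4, 0.2, -3.0])   # third gene fits much worse than a mean predictor
print(scores.mean())                  # -0.8: one bad gene dominates the average
print(np.clip(scores, 0, 1).mean())   # 0.2: clipping caps its influence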
54 changes: 29 additions & 25 deletions src/metrics/script_all.py
@@ -60,29 +60,33 @@

os.makedirs(par['scores_dir'], exist_ok=True)

for dataset_type in ['full', 'hvg']:
for apply_skeleton in [False, True]:
os.makedirs(f"{par['scores_dir']}/{dataset_type}/skeleton_{apply_skeleton}", exist_ok=True)
for layer in par['layers']:
par['layer'] = layer
i = 0
for method in par['methods']:
print(method)
par['prediction'] = f"{par['models_dir']}/{dataset_type}/{method}.csv"
if not os.path.exists(par['prediction']):
print(f"{par['prediction']} doesnt exist. Skipped.")
continue
from regression_1.main import main
reg1 = main(par)
from regression_2.main import main
reg2 = main(par)
score = pd.concat([reg1, reg2], axis=1)
score.index = [method]
if i==0:
df_all = score
else:
df_all = pd.concat([df_all, score])
df_all.to_csv(f"{par['scores_dir']}/{dataset_type}/skeleton_{apply_skeleton}/{layer}-{par['reg_type']}.csv")
print(df_all)
i+=1
for max_n_links in [10000, 50000]:
par['max_n_links'] = max_n_links
for dataset_type in ['full', 'hvg']:
for apply_skeleton in [False, True]:
par['apply_skeleton'] = apply_skeleton
save_dir = f"{par['scores_dir']}/{dataset_type}/{max_n_links}/skeleton_{apply_skeleton}"
os.makedirs(save_dir, exist_ok=True)
for layer in par['layers']:
par['layer'] = layer
i = 0
for method in par['methods']:
print(method)
par['prediction'] = f"{par['models_dir']}/{dataset_type}/{method}.csv"
if not os.path.exists(par['prediction']):
print(f"{par['prediction']} doesnt exist. Skipped.")
continue
from regression_1.main import main
reg1 = main(par)
from regression_2.main import main
reg2 = main(par)
score = pd.concat([reg1, reg2], axis=1)
score.index = [method]
if i==0:
df_all = score
else:
df_all = pd.concat([df_all, score])
df_all.to_csv(f"{save_dir}/{layer}-{par['reg_type']}.csv")
print(df_all)
i+=1
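
The rewritten loop now also sweeps max_n_links and records apply_skeleton in par, writing one CSV per (dataset, max_n_links, skeleton, layer) combination under scores_dir. A sketch for gathering those files back into a single tidy table; the directory layout is read off the save path above and should be treated as an assumption if that path changes.

import glob
import os
import pandas as pd

def collect_scores(scores_dir: str) -> pd.DataFrame:
    frames = []
    for path in glob.glob(os.path.join(scores_dir, '*', '*', 'skeleton_*', '*.csv')):
        dataset_type, max_n_links, skeleton, fname = path.split(os.sep)[-4:]
        layer, reg_type = os.path.splitext(fname)[0].rsplit('-', 1)
        df = pd.read_csv(path, index_col=0)   # index holds the method names
        df = df.assign(dataset_type=dataset_type, max_n_links=int(max_n_links),
                       apply_skeleton=(skeleton == 'skeleton_True'),
                       layer=layer, reg_type=reg_type)
        frames.append(df)
    return pd.concat(frames) if frames else pd.DataFrame()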
