scgpt download datasets bug fixed

openproblems-bio · Nov 19, 2024 · a3656b4
1 parent 403533f
commit a3656b4
Show file tree

Hide file tree

Showing 9 changed files with 869 additions and 825 deletions.
diff --git a/runs.ipynb b/runs.ipynb
diff --git a/src/methods/multi_omics/celloracle/script.py b/src/methods/multi_omics/celloracle/script.py
@@ -42,7 +42,7 @@
     meta['resources_dir'] = args.resources_dir   
 
 try:
-    meta['resources_dir'] =par['resources_dir ']
+    meta['resources_dir'] = par['resources_dir']
 except:
     pass
 

diff --git a/src/methods/single_omics/scgpt/script.py b/src/methods/single_omics/scgpt/script.py
@@ -64,14 +64,20 @@
 par['vocab_file'] = f"{par['temp_dir']}/vocab.json"
 
 
-command = f"wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1CPVtpWUJ2nkI9jGignlHLcefBe6Gk-F9' -O {par['model_file']}"
-subprocess.run(command, shell=True, check=True)
-
-command = f"wget --no-check-certificate 'https://drive.google.com/file/d/1Qzb6Y9UB342a2QxmY-BCubSvcmYZ5jw3/view?usp=drive_link' -O {par['vocab_file']}"
-subprocess.run(command, shell=True, check=True)
-
-command = f"wget --no-check-certificate 'https://drive.google.com/file/d/1VwPGHuSorVAXyTreMFI1yzMougtUDeUt/view?usp=drive_link' -O {par['model_config_file']}"
-subprocess.run(command, shell=True, check=True)
+import requests
+def download_file(output_file, url):
+    response = requests.get(url, stream=True)
+    if response.status_code == 200:
+        with open(output_file, "wb") as f:
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:
+                    f.write(chunk)
+        print(f"File downloaded successfully and saved to {output_file}")
+    else:
+        print(f"Failed to download file. HTTP status code: {response.status_code}")
+download_file(par['model_file'], 'https://drive.google.com/uc?export=download&id=1CPVtpWUJ2nkI9jGignlHLcefBe6Gk-F9')
+download_file(par['vocab_file'], 'https://drive.google.com/file/d/1Qzb6Y9UB342a2QxmY-BCubSvcmYZ5jw3/view?usp=drive_link')
+download_file(par['model_config_file'], 'https://drive.google.com/file/d/1VwPGHuSorVAXyTreMFI1yzMougtUDeUt/view?usp=drive_link')
 
 
 # os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:50"

diff --git a/src/metrics/regression_2/main.py b/src/metrics/regression_2/main.py
@@ -72,10 +72,10 @@ def cross_validate_gene(
         return results
 
     # Feature selection
-    scores = np.abs(grn[:, j])
-    scores[j] = -1
-    selected_features = np.argsort(scores)[-n_features:]
-    selected_features = selected_features[scores[selected_features] > 0]
+    regulatory_importance = np.abs(grn[:, j])
+    regulatory_importance[j] = -1
+    selected_features = np.argsort(regulatory_importance)[-n_features:]
+    selected_features = selected_features[regulatory_importance[selected_features] > 0]
     if len(selected_features) == 0:
         return results
     assert j not in selected_features
@@ -242,8 +242,7 @@ def static_approach(
         gene_names: List[str],
         tf_names: Set[str],
         reg_type: str,
-        n_jobs:int,
-        n_features_dict:dict
+        n_jobs:int
 ) -> float:
 
     # Cross-validate each gene using the inferred GRN to define select input features
@@ -322,7 +321,6 @@ def main(par: Dict[str, Any]) -> pd.DataFrame:
         with open(par['consensus'], 'r') as f:
             data = json.load(f)
         gene_names_ = np.asarray(list(data.keys()), dtype=object)
-        n_features_dict = {gene_name: i for i, gene_name in enumerate(gene_names_)}
 
         n_features_theta_min = np.asarray([data[gene_name]['0'] for gene_name in gene_names], dtype=int)
         n_features_theta_median = np.asarray([data[gene_name]['0.5'] for gene_name in gene_names], dtype=int)
@@ -335,18 +333,17 @@ def main(par: Dict[str, Any]) -> pd.DataFrame:
 
         # Evaluate GRN
         verbose_print(par['verbose'], f'Compute metrics for layer: {layer}', 3)
-        # print(f'Dynamic approach:', flush=True)
         verbose_print(par['verbose'], f'Static approach (theta=0):', 3)
-        score_static_min = static_approach(net_matrix, n_features_theta_min, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers'], n_features_dict=n_features_dict)
+        score_static_min = static_approach(net_matrix, n_features_theta_min, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers'])
         verbose_print(par['verbose'], f'Static approach (theta=0.5):', 3)
-        score_static_median = static_approach(net_matrix, n_features_theta_median, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers'], n_features_dict=n_features_dict)
-        # print(f'Static approach (theta=1):', flush=True)
-        # score_static_max = static_approach(net_matrix, n_features_theta_max, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers'], n_features_dict=n_features_dict)
+        score_static_median = static_approach(net_matrix, n_features_theta_median, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers'])
+        print(f'Static approach (theta=1):', flush=True)
+        score_static_max = static_approach(net_matrix, n_features_theta_max, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers'])
 
         results = {
             'static-theta-0.0': [float(score_static_min)],
             'static-theta-0.5': [float(score_static_median)],
-            # 'static-theta-1.0': [float(score_static_max)],
+            'static-theta-1.0': [float(score_static_max)],
         }
 
         # # Add dynamic score

diff --git a/src/metrics/script_all.py b/src/metrics/script_all.py
@@ -16,7 +16,7 @@ def define_par(dataset):
     raise ValueError('define first')
 
   par = {
-      'reg_type': 'GB',
+      'reg_type': 'ridge',
       'models_dir': f"resources/grn_models/{dataset}",
       'scores_dir': f"output/temp/{dataset}",
 

diff --git a/src/process_data/multiomics/subset_hvg/config.vsh.yaml b/src/process_data/multiomics/subset_hvg/config.vsh.yaml
diff --git a/src/process_data/multiomics/subset_hvg/script.py b/src/process_data/multiomics/subset_hvg/script.py
diff --git a/src/process_data/perturbation/normalization/config.vsh.yaml b/src/process_data/perturbation/normalization/config.vsh.yaml
@@ -48,7 +48,6 @@ functionality:
               required: true
       required: false
       direction: output
-      example: resources_test/evaluation_datasets/op_perturbation.h5ad 
 
 
   resources:

diff --git a/src/process_data/qc/script.py b/src/process_data/qc/script.py
@@ -42,7 +42,7 @@
         meta['resources_dir'] = args.resources_dir
     try:
-        meta['resources_dir'] =par['resources_dir ']
+        meta['resources_dir'] = par['resources_dir']
     except:
         pass
@@ Expand Down @@