Finish integrating AlphaFold 3

BioinfoMachineLearning · Dec 10, 2024 · b0f9121 · b0f9121
1 parent 9343142
commit b0f9121
Show file tree

Hide file tree

Showing 13 changed files with 229 additions and 30 deletions.
diff --git a/README.md b/README.md
@@ -861,48 +861,48 @@ python3 posebench/data/af3_output_extraction.py dataset=casp15 repeat_index=1
 Relax the generated ligand structures inside of their respective protein pockets
 
 ```bash
-python3 posebench/models/inference_relaxation.py method=chai-lab dataset=posebusters_benchmark remove_initial_protein_hydrogens=true repeat_index=1
+python3 posebench/models/inference_relaxation.py method=alphafold3 dataset=posebusters_benchmark remove_initial_protein_hydrogens=true repeat_index=1
 ...
-python3 posebench/models/inference_relaxation.py method=chai-lab dataset=astex_diverse remove_initial_protein_hydrogens=true repeat_index=1
+python3 posebench/models/inference_relaxation.py method=alphafold3 dataset=astex_diverse remove_initial_protein_hydrogens=true repeat_index=1
 ...
-python3 posebench/models/inference_relaxation.py method=chai-lab dataset=dockgen remove_initial_protein_hydrogens=true repeat_index=1
+python3 posebench/models/inference_relaxation.py method=alphafold3 dataset=dockgen remove_initial_protein_hydrogens=true repeat_index=1
 ...
 ```
 
 Align predicted protein-ligand structures to ground-truth complex structures
 
 ```bash
 conda activate PyMOL-PoseBench
-python3 posebench/analysis/complex_alignment.py method=chai-lab dataset=posebusters_benchmark repeat_index=1
+python3 posebench/analysis/complex_alignment.py method=alphafold3 dataset=posebusters_benchmark repeat_index=1
 ...
-python3 posebench/analysis/complex_alignment.py method=chai-lab dataset=astex_diverse repeat_index=1
+python3 posebench/analysis/complex_alignment.py method=alphafold3 dataset=astex_diverse repeat_index=1
 ...
-python3 posebench/analysis/complex_alignment.py method=chai-lab dataset=dockgen repeat_index=1
+python3 posebench/analysis/complex_alignment.py method=alphafold3 dataset=dockgen repeat_index=1
 conda deactivate
 ...
 ```
 
 Analyze inference results for each dataset
 
 ```bash
-python3 posebench/analysis/inference_analysis.py method=chai-lab dataset=posebusters_benchmark repeat_index=1
+python3 posebench/analysis/inference_analysis.py method=alphafold3 dataset=posebusters_benchmark repeat_index=1
 ...
-python3 posebench/analysis/inference_analysis.py method=chai-lab dataset=astex_diverse repeat_index=1
+python3 posebench/analysis/inference_analysis.py method=alphafold3 dataset=astex_diverse repeat_index=1
 ...
-python3 posebench/analysis/inference_analysis.py method=chai-lab dataset=dockgen repeat_index=1
+python3 posebench/analysis/inference_analysis.py method=alphafold3 dataset=dockgen repeat_index=1
 ...
 ```
 
 Analyze inference results for the CASP15 dataset
 
 ```bash
 # first assemble (unrelaxed and post-ranking relaxed) CASP15-compliant prediction submission files for scoring
-python3 posebench/models/ensemble_generation.py ensemble_methods=\[chai-lab\] input_csv_filepath=data/test_cases/casp15/ensemble_inputs.csv output_dir=data/test_cases/casp15/top_chai-lab_ensemble_predictions_1 skip_existing=true relax_method_ligands_post_ranking=false export_file_format=casp15 export_top_n=5 combine_casp_output_files=true max_method_predictions=5 method_top_n_to_select=5 resume=true ensemble_benchmarking=true ensemble_benchmarking_dataset=casp15 cuda_device_index=0 ensemble_benchmarking_repeat_index=1
-python3 posebench/models/ensemble_generation.py ensemble_methods=\[chai-lab\] input_csv_filepath=data/test_cases/casp15/ensemble_inputs.csv output_dir=data/test_cases/casp15/top_chai-lab_ensemble_predictions_1 skip_existing=true relax_method_ligands_post_ranking=true export_file_format=casp15 export_top_n=5 combine_casp_output_files=true max_method_predictions=5 method_top_n_to_select=5 resume=true ensemble_benchmarking=true ensemble_benchmarking_dataset=casp15 cuda_device_index=0 ensemble_benchmarking_repeat_index=1
+python3 posebench/models/ensemble_generation.py ensemble_methods=\[alphafold3\] input_csv_filepath=data/test_cases/casp15/ensemble_inputs.csv output_dir=data/test_cases/casp15/top_alphafold3_ensemble_predictions_1 skip_existing=true relax_method_ligands_post_ranking=false export_file_format=casp15 export_top_n=5 combine_casp_output_files=true max_method_predictions=5 method_top_n_to_select=5 resume=true ensemble_benchmarking=true ensemble_benchmarking_dataset=casp15 cuda_device_index=0 ensemble_benchmarking_repeat_index=1
+python3 posebench/models/ensemble_generation.py ensemble_methods=\[alphafold3\] input_csv_filepath=data/test_cases/casp15/ensemble_inputs.csv output_dir=data/test_cases/casp15/top_alphafold3_ensemble_predictions_1 skip_existing=true relax_method_ligands_post_ranking=true export_file_format=casp15 export_top_n=5 combine_casp_output_files=true max_method_predictions=5 method_top_n_to_select=5 resume=true ensemble_benchmarking=true ensemble_benchmarking_dataset=casp15 cuda_device_index=0 ensemble_benchmarking_repeat_index=1
 # NOTE: the suffixes for both `output_dir` and `ensemble_benchmarking_repeat_index` should be modified to e.g., 2, 3, ...
 ...
 # now score the CASP15-compliant submissions using the official CASP scoring pipeline
-python3 posebench/analysis/inference_analysis_casp.py method=chai-lab dataset=casp15 repeat_index=1
+python3 posebench/analysis/inference_analysis_casp.py method=alphafold3 dataset=casp15 repeat_index=1
 ...
 ```
 

diff --git a/configs/analysis/complex_alignment.yaml b/configs/analysis/complex_alignment.yaml
@@ -1,5 +1,5 @@
-method: neuralplexer # the method for which to align predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`)
-vina_binding_site_method: p2rank # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`)
+method: neuralplexer # the method for which to align predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `alphafold3`)
+vina_binding_site_method: p2rank # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `alphafold3`, `p2rank`)
 dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
 input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse

diff --git a/configs/analysis/inference_analysis.yaml b/configs/analysis/inference_analysis.yaml
@@ -1,6 +1,6 @@
 full_report: true # whether to generate a full PoseBusters report (i.e. with all metrics) or a summary report (i.e. with only the most important metrics)
-method: diffdock # the method for which to score predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `vina`, `ensemble`)
-vina_binding_site_method: p2rank # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `p2rank`)
+method: diffdock # the method for which to score predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `alphafold3`, `vina`, `ensemble`)
+vina_binding_site_method: p2rank # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `alphafold3`, `p2rank`)
 dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
 input_csv_path: ${resolve_method_input_csv_path:${method},${dataset},${pocket_only_baseline}} # the input CSV filepath with which to run inference

diff --git a/configs/analysis/inference_analysis_casp.yaml b/configs/analysis/inference_analysis_casp.yaml
@@ -1,8 +1,8 @@
 full_report: true # whether to generate a full PoseBusters report (i.e. with all metrics) or a summary report (i.e. with only the most important metrics)
 python_exec_path: ${oc.env:HOME}/mambaforge/envs/casp15_ligand_scoring/bin/python3 # the Python executable to use
 scoring_script_path: ${oc.env:PROJECT_ROOT}/posebench/analysis/casp15_ligand_scoring/score_predictions.py # the path to the script to use for scoring CASP predictions
-method: diffdock # the method for which to score predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `vina`, `ensemble`, `tulip`)
-vina_binding_site_method: p2rank # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`)
+method: diffdock # the method for which to score predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `alphafold3`, `vina`, `ensemble`, `tulip`)
+vina_binding_site_method: p2rank # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `alphafold3`, `p2rank`)
 dataset: casp15 # the dataset to use - NOTE: must be one of (`casp15`)
 ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
 predictions_dir: ${oc.env:PROJECT_ROOT}/data/test_cases/${dataset}/top_${method}_ensemble_predictions_${repeat_index} # the directory containing the predictions to analyze

diff --git a/configs/model/ensemble_generation.yaml b/configs/model/ensemble_generation.yaml
@@ -1,5 +1,5 @@
 # General inference arguments:
-ensemble_methods: [diffdock, dynamicbind, neuralplexer, rfaa] # the methods from which to gather predictions for ensembling - NOTE: must be one of (`diffdock`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `vina`, `tulip`)
+ensemble_methods: [diffdock, dynamicbind, neuralplexer, rfaa] # the methods from which to gather predictions for ensembling - NOTE: must be one of (`diffdock`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `alphafold3`, `vina`, `tulip`)
 generate_vina_scripts: false # whether to generate Vina scripts using other methods' binding site predictions - NOTE: `resume` must also be `true` when this is `true`, meaning other methods' predictions must have already been generated locally
 rank_single_method_intrinsically: true # whether to rank single-method predictions using either `consensus` or `vina` ranking (false) or instead using their intrinsic (explicit) rank assignment (true)
 output_bash_file_dir: ensemble_generation_scripts # the directory in which to save the generated Bash scripts
@@ -146,6 +146,8 @@ rfaa_inference_dir_name: null # the name of the inference output directory to us
 # Chai-1 inference arguments:
 chai_out_path: ${oc.env:PROJECT_ROOT}/forks/chai-lab/inference/chai-lab_ensemble_outputs # the output directory to which to write the predictions
 chai_skip_existing: true # whether to skip running inference if the prediction for a target already exists
+# AlphaFold 3 inference arguments:
+alphafold3_out_path: ${oc.env:PROJECT_ROOT}/forks/alphafold3/inference/alphafold3_ensemble_outputs # the output directory to which to write the predictions
 # Vina inference arguments:
 vina_binding_site_methods: [p2rank] # the methods to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `dynamicbind`, `neuralplexer`, `flowdock`, `p2rank`)
 vina_python2_exec_path: ${oc.env:PROJECT_ROOT}/forks/Vina/ADFR/bin/python # the path to the Python 2 executable

diff --git a/configs/model/inference_relaxation.yaml b/configs/model/inference_relaxation.yaml
@@ -1,5 +1,5 @@
-method: diffdock # the method for which to relax predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `vina`, `tulip`)
-vina_binding_site_method: p2rank # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `p2rank`)
+method: diffdock # the method for which to relax predictions - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `alphafold3`, `vina`, `tulip`)
+vina_binding_site_method: p2rank # the method to use for Vina binding site prediction - NOTE: must be one of (`diffdock`, `fabind`, `dynamicbind`, `neuralplexer`, `flowdock`, `rfaa`, `chai-lab`, `alphafold3`, `p2rank`)
 dataset: posebusters_benchmark # the dataset for which to relax predictions - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
 ensemble_ranking_method: consensus # the method with which to rank-order and select the top ensemble prediction for each target - NOTE: must be one of (`consensus`, `ff`)
 num_processes: 1 # the number of parallel processes to use for relaxation

diff --git a/docs/source/configs/data.rst b/docs/source/configs/data.rst
@@ -93,6 +93,12 @@ Chai-1 output extraction
     :language: yaml
     :caption: :file:`data/chai_output_extraction.yaml`
 
+AlphaFold 3 output extraction
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. literalinclude:: ../../../configs/data/alphafold3_output_extraction.yaml
+    :language: yaml
+    :caption: :file:`data/alphafold3_output_extraction.yaml`
+
 TULIP output extraction
 ^^^^^^^^^^^^^^^^^^^^^^^^
 .. literalinclude:: ../../../configs/data/tulip_output_extraction.yaml

diff --git a/posebench/__init__.py b/posebench/__init__.py
@@ -16,6 +16,7 @@
     "flowdock": "FlowDock",
     "rfaa": "RoseTTAFold-All-Atom",
     "chai-lab": "chai-lab",
+    "alphafold3": "AlphaFold 3",
     "vina": "Vina",
     "tulip": "TULIP",
     "p2rank": "P2Rank",
@@ -100,7 +101,7 @@ def resolve_method_protein_dir(
             "results",
             f"{dataset}{pocket_only_suffix}",
         )
-    elif method in ["neuralplexer", "flowdock", "rfaa", "chai-lab"]:
+    elif method in ["neuralplexer", "flowdock", "rfaa", "chai-lab", "alphafold3"]:
         return os.path.join(
             "forks",
             METHOD_TITLE_MAPPING.get(method, method),
@@ -143,10 +144,13 @@ def resolve_method_ligand_dir(
         "flowdock",
         "rfaa",
         "chai-lab",
+        "alphafold3",
         "tulip",
     ]:
         output_suffix = (
-            "s" if method in ["neuralplexer", "flowdock", "rfaa", "chai-lab", "tulip"] else ""
+            "s"
+            if method in ["neuralplexer", "flowdock", "rfaa", "chai-lab", "alphafold3", "tulip"]
+            else ""
         )
         return os.path.join(
             "forks",
@@ -208,10 +212,13 @@ def resolve_method_output_dir(
         "flowdock",
         "rfaa",
         "chai-lab",
+        "alphafold3",
         "tulip",
     ]:
         output_suffix = (
-            "s" if method in ["neuralplexer", "flowdock", "rfaa", "chai-lab", "tulip"] else ""
+            "s"
+            if method in ["neuralplexer", "flowdock", "rfaa", "chai-lab", "alphafold3", "tulip"]
+            else ""
         )
         return os.path.join(
             "forks",
@@ -260,6 +267,7 @@ def resolve_method_input_csv_path(method: str, dataset: str, pocket_only_baselin
         "flowdock",
         "rfaa",
         "chai-lab",
+        "alphafold3",
         "vina",
         "tulip",
     ]:

diff --git a/posebench/analysis/complex_alignment.py b/posebench/analysis/complex_alignment.py
@@ -305,6 +305,7 @@ def main(cfg: DictConfig):
             "flowdock",
             "rfaa",
             "chai-lab",
+            "alphafold3",
         ]:
             output_dir = Path(str(output_dir).replace("_relaxed", ""))
 
@@ -350,6 +351,18 @@ def main(cfg: DictConfig):
                     and "_LIG_" not in file.stem
                 ]
             )
+        elif cfg.method == "alphafold3":
+            output_ligand_files = list(output_dir.rglob(f"*_model_ligand{config}.sdf"))
+            output_ligand_files = sorted(
+                [
+                    file
+                    for file in output_ligand_files
+                    if config == "_relaxed"
+                    or (config == "" and "_relaxed" not in file.stem)
+                    and "_aligned" not in file.stem
+                    and "_LIG_" not in file.stem
+                ]
+            )
         else:
             raise ValueError(f"Invalid method: {cfg.method}")
 
@@ -395,6 +408,16 @@ def main(cfg: DictConfig):
                     and "_aligned" not in file.stem
                 ]
             )
+        elif cfg.method == "alphafold3":
+            output_protein_files = list(output_dir.rglob("*_model_protein.pdb"))
+            output_protein_files = sorted(
+                [
+                    file
+                    for file in output_protein_files
+                    if (config == "_relaxed" or (config == "" and "_relaxed" not in file.stem))
+                    and "_aligned" not in file.stem
+                ]
+            )
         else:
             raise ValueError(f"Invalid method: {cfg.method}")
 
@@ -410,7 +433,7 @@ def main(cfg: DictConfig):
                         )
                     ]
                 )
-            elif cfg.method in ["rfaa", "chai-lab"]:
+            elif cfg.method == "chai-lab":
                 output_protein_files = sorted(
                     [
                         item