Begin adding AF3

BioinfoMachineLearning · Dec 9, 2024 · 2b47c5d · 2b47c5d
1 parent e947882
commit 2b47c5d
Show file tree

Hide file tree

Showing 5 changed files with 234 additions and 6 deletions.
diff --git a/.gitignore b/.gitignore
@@ -164,6 +164,7 @@ configs/local/default.yaml
 
 # Forks
 /workdir/
+/forks/alphafold3/prediction_outputs/
 /forks/chai-lab/chai-lab/
 /forks/chai-lab/prediction_inputs/
 /forks/chai-lab/prediction_outputs/

diff --git a/README.md b/README.md
@@ -211,6 +211,10 @@ rm rfaa_benchmark_method_predictions.tar.gz
 wget https://zenodo.org/records/13858866/files/chai_benchmark_method_predictions.tar.gz
 tar -xzf chai_benchmark_method_predictions.tar.gz
 rm chai_benchmark_method_predictions.tar.gz
+# AlphaFold 3 predictions and results
+wget https://zenodo.org/records/13858866/files/af3_benchmark_method_predictions.tar.gz
+tar -xzf af3_benchmark_method_predictions.tar.gz
+rm af3_benchmark_method_predictions.tar.gz
 # AutoDock Vina predictions and results
 wget https://zenodo.org/records/13858866/files/vina_benchmark_method_predictions.tar.gz
 tar -xzf vina_benchmark_method_predictions.tar.gz
@@ -324,6 +328,7 @@ conda deactivate
 | `FlowDock`             | [Morehead et al.](https://github.com/BioinfoMachineLearning/FlowDock)         | ✓                 | ✓                       | ✓                   | ✓                |
 | `RoseTTAFold-All-Atom` | [Krishna et al.](https://www.science.org/doi/10.1126/science.adl2528)         | ✓                 | ✓                       | ✓                   | ✓                |
 | `Chai-1`               | [Chai Discovery](https://chaiassets.com/chai-1/paper/technical_report_v1.pdf) | ✓                 | ✓                       | ✓                   | ✓                |
+| `AlphaFold 3`          | [Abramson et al.](https://www.nature.com/articles/s41586-024-07487-w)         | ✓                 | ✓                       | ✓                   | ✓                |
 
 ### Methods available for ensembling
 
@@ -344,6 +349,7 @@ conda deactivate
 | `FlowDock`             | [Morehead et al.](https://github.com/BioinfoMachineLearning/FlowDock)         | ✓                 | ✓                       | ✓                   | ✓                |
 | `RoseTTAFold-All-Atom` | [Krishna et al.](https://www.science.org/doi/10.1126/science.adl2528)         | ✓                 | ✓                       | ✓                   | ✓                |
 | `Chai-1`               | [Chai Discovery](https://chaiassets.com/chai-1/paper/technical_report_v1.pdf) | ✓                 | ✓                       | ✓                   | ✓                |
+| `AlphaFold 3`          | [Abramson et al.](https://www.nature.com/articles/s41586-024-07487-w)         | ✓                 | ✓                       | ✓                   | ✓                |
 
 **NOTE**: Have a new method to add? Please let us know by creating a pull request. We would be happy to work with you to integrate new methodology into this benchmark!
 
@@ -835,6 +841,71 @@ python3 posebench/analysis/inference_analysis_casp.py method=chai-lab dataset=ca
 ...
 ```
 
+### How to run inference with `AlphaFold 3`
+
+Run inference (3x) using the academically-available inference code released on [GitHub](https://github.com/google-deepmind/alphafold3), saving each run's structures to a unique output directory located at `forks/alphafold3/prediction_outputs/{dataset=posebusters_benchmark,astex_diverse,dockgen,casp15}_{repeat_index=1,2,3}`
+
+Then, extract predictions into separate files for proteins and ligands
+
+```bash
+python3 posebench/data/af3_output_extraction.py dataset=posebusters_benchmark repeat_index=1
+...
+python3 posebench/data/af3_output_extraction.py dataset=astex_diverse repeat_index=1
+...
+python3 posebench/data/af3_output_extraction.py dataset=dockgen repeat_index=1
+...
+python3 posebench/data/af3_output_extraction.py dataset=casp15 repeat_index=1
+...
+```
+
+Relax the generated ligand structures inside of their respective protein pockets
+
+```bash
+python3 posebench/models/inference_relaxation.py method=chai-lab dataset=posebusters_benchmark remove_initial_protein_hydrogens=true repeat_index=1
+...
+python3 posebench/models/inference_relaxation.py method=chai-lab dataset=astex_diverse remove_initial_protein_hydrogens=true repeat_index=1
+...
+python3 posebench/models/inference_relaxation.py method=chai-lab dataset=dockgen remove_initial_protein_hydrogens=true repeat_index=1
+...
+```
+
+Align predicted protein-ligand structures to ground-truth complex structures
+
+```bash
+conda activate PyMOL-PoseBench
+python3 posebench/analysis/complex_alignment.py method=chai-lab dataset=posebusters_benchmark repeat_index=1
+...
+python3 posebench/analysis/complex_alignment.py method=chai-lab dataset=astex_diverse repeat_index=1
+...
+python3 posebench/analysis/complex_alignment.py method=chai-lab dataset=dockgen repeat_index=1
+conda deactivate
+...
+```
+
+Analyze inference results for each dataset
+
+```bash
+python3 posebench/analysis/inference_analysis.py method=chai-lab dataset=posebusters_benchmark repeat_index=1
+...
+python3 posebench/analysis/inference_analysis.py method=chai-lab dataset=astex_diverse repeat_index=1
+...
+python3 posebench/analysis/inference_analysis.py method=chai-lab dataset=dockgen repeat_index=1
+...
+```
+
+Analyze inference results for the CASP15 dataset
+
+```bash
+# first assemble (unrelaxed and post ranking-relaxed) CASP15-compliant prediction submission files for scoring
+python3 posebench/models/ensemble_generation.py ensemble_methods=\[chai-lab\] input_csv_filepath=data/test_cases/casp15/ensemble_inputs.csv output_dir=data/test_cases/casp15/top_chai-lab_ensemble_predictions_1 skip_existing=true relax_method_ligands_post_ranking=false export_file_format=casp15 export_top_n=5 combine_casp_output_files=true max_method_predictions=5 method_top_n_to_select=5 resume=true ensemble_benchmarking=true ensemble_benchmarking_dataset=casp15 cuda_device_index=0 ensemble_benchmarking_repeat_index=1
+python3 posebench/models/ensemble_generation.py ensemble_methods=\[chai-lab\] input_csv_filepath=data/test_cases/casp15/ensemble_inputs.csv output_dir=data/test_cases/casp15/top_chai-lab_ensemble_predictions_1 skip_existing=true relax_method_ligands_post_ranking=true export_file_format=casp15 export_top_n=5 combine_casp_output_files=true max_method_predictions=5 method_top_n_to_select=5 resume=true ensemble_benchmarking=true ensemble_benchmarking_dataset=casp15 cuda_device_index=0 ensemble_benchmarking_repeat_index=1
+# NOTE: the suffixes for both `output_dir` and `ensemble_benchmarking_repeat_index` should be modified to e.g., 2, 3, ...
+...
+# now score the CASP15-compliant submissions using the official CASP scoring pipeline
+python3 posebench/analysis/inference_analysis_casp.py method=chai-lab dataset=casp15 repeat_index=1
+...
+```
+
 ### How to run inference with `AutoDock Vina`
 
 Prepare CSV input files
@@ -1073,6 +1144,7 @@ rm -rf docs/build/ && sphinx-build docs/source/ docs/build/ # NOTE: errors can s
 
 `PoseBench` builds upon the source code and data from the following projects:
 
+- [alphafold3](https://github.com/google-deepmind/alphafold3)
 - [AutoDock-Vina](https://github.com/ccsb-scripps/AutoDock-Vina)
 - [casp15_ligand](https://git.scicore.unibas.ch/schwede/casp15_ligand)
 - [chai-lab](https://github.com/chaidiscovery/chai-lab)

diff --git a/configs/data/af3_output_extraction.yaml b/configs/data/af3_output_extraction.yaml
@@ -0,0 +1,12 @@
+dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`)
+prediction_outputs_dir: ${oc.env:PROJECT_ROOT}/forks/alphafold3/prediction_outputs/${dataset}_${repeat_index}
+inference_outputs_dir: ${oc.env:PROJECT_ROOT}/forks/alphafold3/inference/alphafold3_${dataset}_outputs_${repeat_index}
+input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse
+posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts
+dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file
+complex_filepath: null # if not `null`, this should be the path to the complex PDB file for which to extract outputs
+complex_id: null # if not `null`, this should be the complex ID of the single complex for which to extract outputs
+ligand_smiles: null # if not `null`, this should be the (i.e., `.` fragment-separated) complex ligand SMILES string of the single complex for which to extract outputs
+output_dir: null # if not `null`, this should be the path to the output file to which to write the extracted outputs
+repeat_index: 1 # the repeat index with which inference was run
+pocket_only_baseline: false # whether to prepare the pocket-only baseline
diff --git a/posebench/data/af3_output_extraction.py b/posebench/data/af3_output_extraction.py
@@ -0,0 +1,143 @@
+# -------------------------------------------------------------------------------------------------------------------------------------
+# Following code curated for PoseBench: (https://github.com/BioinfoMachineLearning/PoseBench)
+# -------------------------------------------------------------------------------------------------------------------------------------
+
+import logging
+import os
+
+import hydra
+import numpy as np
+import rootutils
+from omegaconf import DictConfig, open_dict
+
+rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True)
+
+from posebench.utils.data_utils import (
+    extract_protein_and_ligands_with_prody,
+    parse_inference_inputs_from_dir,
+)
+
+logging.basicConfig(format="[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
+
+@hydra.main(
+    version_base="1.3",
+    config_path="../../configs/data",
+    config_name="af3_output_extraction.yaml",
+)
+def main(cfg: DictConfig):
+    """Extract proteins and ligands separately from the prediction outputs."""
+    pdb_ids = None
+    if cfg.dataset == "posebusters_benchmark" and cfg.posebusters_ccd_ids_filepath is not None:
+        assert os.path.exists(
+            cfg.posebusters_ccd_ids_filepath
+        ), f"Invalid CCD IDs file path for PoseBusters Benchmark: {os.path.exists(cfg.posebusters_ccd_ids_filepath)}."
+        with open(cfg.posebusters_ccd_ids_filepath) as f:
+            pdb_ids = set(f.read().splitlines())
+    elif cfg.dataset == "dockgen" and cfg.dockgen_test_ids_filepath is not None:
+        assert os.path.exists(
+            cfg.dockgen_test_ids_filepath
+        ), f"Invalid test IDs file path for DockGen: {os.path.exists(cfg.dockgen_test_ids_filepath)}."
+        with open(cfg.dockgen_test_ids_filepath) as f:
+            pdb_ids = {line.replace(" ", "-") for line in f.read().splitlines()}
+    elif cfg.dataset not in ["posebusters_benchmark", "astex_diverse", "dockgen", "casp15"]:
+        raise ValueError(f"Dataset `{cfg.dataset}` not supported.")
+
+    if cfg.pocket_only_baseline:
+        with open_dict(cfg):
+            cfg.prediction_outputs_dir = cfg.prediction_outputs_dir.replace(
+                cfg.dataset, f"{cfg.dataset}_pocket_only"
+            )
+            cfg.inference_outputs_dir = cfg.inference_outputs_dir.replace(
+                f"alphafold3_{cfg.dataset}", f"alphafold3_pocket_only_{cfg.dataset}"
+            )
+
+    if cfg.complex_filepath is not None:
+        # process single-complex inputs
+        assert os.path.exists(
+            cfg.complex_filepath
+        ), f"Complex PDB file not found: {cfg.complex_filepath}"
+        assert (
+            cfg.complex_id is not None
+        ), "Complex ID must be provided when extracting single complex outputs."
+        assert (
+            cfg.ligand_smiles is not None
+        ), "Ligand SMILES must be provided when extracting single complex outputs."
+        assert (
+            cfg.output_dir is not None
+        ), "Output directory must be provided when extracting single complex outputs."
+        intermediate_output_filepath = cfg.complex_filepath
+        final_output_filepath = os.path.join(
+            cfg.output_dir, cfg.complex_id, os.path.basename(cfg.complex_filepath)
+        )
+        os.makedirs(os.path.dirname(final_output_filepath), exist_ok=True)
+        try:
+            extract_protein_and_ligands_with_prody(
+                intermediate_output_filepath,
+                final_output_filepath.replace(".cif", "_protein.pdb"),
+                final_output_filepath.replace(".cif", "_ligand.sdf"),
+                sanitize=False,
+                add_element_types=True,
+                ligand_smiles=cfg.ligand_smiles,
+            )
+        except Exception as e:
+            logger.error(f"Failed to extract protein and ligands for {cfg.complex_id} due to: {e}")
+    else:
+        # process all complexes in a dataset
+        smiles_and_pdb_id_list = parse_inference_inputs_from_dir(
+            cfg.input_data_dir,
+            pdb_ids=pdb_ids,
+        )
+        pdb_id_to_smiles = {pdb_id: smiles for smiles, pdb_id in smiles_and_pdb_id_list}
+        for item in os.listdir(cfg.prediction_outputs_dir):
+            output_item_path = os.path.join(cfg.prediction_outputs_dir, item)
+
+            if os.path.isdir(output_item_path):
+                for file in os.listdir(output_item_path):
+                    if not file.endswith(".cif"):
+                        continue
+
+                    if cfg.dataset in ["posebusters_benchmark", "astex_diverse"]:
+                        item = item.upper()
+                        ligand_smiles = pdb_id_to_smiles[item]
+                    elif cfg.dataset == "dockgen":
+                        item = item.upper()
+                        item = "_".join([item.split("_")[0].lower(), *item.split("_")[1:]])
+                        ligand_smiles = pdb_id_to_smiles[item]
+                    else:
+                        # NOTE: for the `casp15` dataset, standalone ligand SMILES are not available
+                        ligand_smiles = None
+
+                    intermediate_output_filepath = os.path.join(output_item_path, file)
+                    final_output_filepath = os.path.join(cfg.inference_outputs_dir, item, file)
+                    os.makedirs(os.path.dirname(final_output_filepath), exist_ok=True)
+
+                    try:
+                        extract_protein_and_ligands_with_prody(
+                            intermediate_output_filepath,
+                            final_output_filepath.replace(".cif", "_protein.pdb"),
+                            final_output_filepath.replace(".cif", "_ligand.sdf"),
+                            sanitize=False,
+                            add_element_types=True,
+                            ligand_smiles=ligand_smiles,
+                        )
+                    except Exception as e:
+                        logger.error(
+                            f"Failed to extract protein and ligands for {item} due to: {e}"
+                        )
+                        try:
+                            os.remove(final_output_filepath.replace(".cif", "_protein.pdb"))
+                            os.remove(final_output_filepath.replace(".cif", "_ligand.sdf"))
+                        except Exception as e:
+                            logger.error(
+                                f"Failed to remove partially extracted protein and ligands for {item} due to: {e}"
+                            )
+
+    logger.info(
+        f"Finished extracting {cfg.dataset} protein and ligands from all prediction outputs."
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/posebench/utils/data_utils.py b/posebench/utils/data_utils.py
@@ -5,6 +5,7 @@
 import glob
 import logging
 import os
+import re
 import shutil
 import subprocess  # nosec
 from collections import defaultdict
@@ -1508,14 +1509,14 @@ def read_ligand_expo(
     return df.to_dict("index")
 
 
-def write_pdb_with_prody(protein, pdb_name, add_element_types=False):
-    """Write a protein to a pdb file using ProDy.
+def write_pdb_with_prody(atoms, pdb_name, add_element_types=False):
+    """Write atoms to a pdb file using ProDy.
 
-    :param protein: protein object from prody
+    :param atoms: atoms object from prody
     :param pdb_name: base name for the pdb file
     :param add_element_types: whether to add element types to the pdb file
     """
-    writePDB(pdb_name, protein)
+    writePDB(pdb_name, atoms)
     if add_element_types:
         with open(pdb_name.replace(".pdb", "_elem.pdb"), "w") as f:
             subprocess.run(  # nosec
@@ -1556,8 +1557,6 @@ def process_ligand_with_prody(
     :return: molecule with bond orders assigned
     """
     sub_smiles_provided = sub_smiles is not None
-
-    output = StringIO()
     sub_mol = ligand.select(f"resname {res_name} and chain {chain} and resnum {resnum}")
 
     ligand_expo_mapping = ligand_expo_mapping or read_ligand_expo()
@@ -1573,6 +1572,7 @@ def process_ligand_with_prody(
     else:
         template = None
 
+    output = StringIO()
     writePDBStream(output, sub_mol)
     pdb_string = output.getvalue()
     rd_mol = AllChem.MolFromPDBBlock(pdb_string, sanitize=sanitize)