diff --git a/.gitignore b/.gitignore index d817841..b8a7540 100644 --- a/.gitignore +++ b/.gitignore @@ -164,6 +164,7 @@ configs/local/default.yaml # Forks /workdir/ +/forks/alphafold3/prediction_outputs/ /forks/chai-lab/chai-lab/ /forks/chai-lab/prediction_inputs/ /forks/chai-lab/prediction_outputs/ diff --git a/README.md b/README.md index a42807d..8f057e8 100644 --- a/README.md +++ b/README.md @@ -211,6 +211,10 @@ rm rfaa_benchmark_method_predictions.tar.gz wget https://zenodo.org/records/13858866/files/chai_benchmark_method_predictions.tar.gz tar -xzf chai_benchmark_method_predictions.tar.gz rm chai_benchmark_method_predictions.tar.gz +# AlphaFold 3 predictions and results +wget https://zenodo.org/records/13858866/files/af3_benchmark_method_predictions.tar.gz +tar -xzf af3_benchmark_method_predictions.tar.gz +rm af3_benchmark_method_predictions.tar.gz # AutoDock Vina predictions and results wget https://zenodo.org/records/13858866/files/vina_benchmark_method_predictions.tar.gz tar -xzf vina_benchmark_method_predictions.tar.gz @@ -324,6 +328,7 @@ conda deactivate | `FlowDock` | [Morehead et al.](https://github.com/BioinfoMachineLearning/FlowDock) | ✓ | ✓ | ✓ | ✓ | | `RoseTTAFold-All-Atom` | [Krishna et al.](https://www.science.org/doi/10.1126/science.adl2528) | ✓ | ✓ | ✓ | ✓ | | `Chai-1` | [Chai Discovery](https://chaiassets.com/chai-1/paper/technical_report_v1.pdf) | ✓ | ✓ | ✓ | ✓ | +| `AlphaFold 3` | [Abramson et al.](https://www.nature.com/articles/s41586-024-07487-w) | ✓ | ✓ | ✓ | ✓ | ### Methods available for ensembling @@ -344,6 +349,7 @@ conda deactivate | `FlowDock` | [Morehead et al.](https://github.com/BioinfoMachineLearning/FlowDock) | ✓ | ✓ | ✓ | ✓ | | `RoseTTAFold-All-Atom` | [Krishna et al.](https://www.science.org/doi/10.1126/science.adl2528) | ✓ | ✓ | ✓ | ✓ | | `Chai-1` | [Chai Discovery](https://chaiassets.com/chai-1/paper/technical_report_v1.pdf) | ✓ | ✓ | ✓ | ✓ | +| `AlphaFold 3` | [Abramson et 
al.](https://www.nature.com/articles/s41586-024-07487-w) | ✓ | ✓ | ✓ | ✓ | **NOTE**: Have a new method to add? Please let us know by creating a pull request. We would be happy to work with you to integrate new methodology into this benchmark! @@ -835,6 +841,71 @@ python3 posebench/analysis/inference_analysis_casp.py method=chai-lab dataset=ca ... ``` +### How to run inference with `AlphaFold 3` + +Run inference (3x) using the academically-available inference code released on [GitHub](https://github.com/google-deepmind/alphafold3), saving each run's structures to a unique output directory located at `forks/alphafold3/prediction_outputs/{dataset=posebusters_benchmark,astex_diverse,dockgen,casp15}_{repeat_index=1,2,3}` + +Then, extract predictions into separate files for proteins and ligands + +```bash +python3 posebench/data/af3_output_extraction.py dataset=posebusters_benchmark repeat_index=1 +... +python3 posebench/data/af3_output_extraction.py dataset=astex_diverse repeat_index=1 +... +python3 posebench/data/af3_output_extraction.py dataset=dockgen repeat_index=1 +... +python3 posebench/data/af3_output_extraction.py dataset=casp15 repeat_index=1 +... +``` + +Relax the generated ligand structures inside of their respective protein pockets + +```bash +python3 posebench/models/inference_relaxation.py method=alphafold3 dataset=posebusters_benchmark remove_initial_protein_hydrogens=true repeat_index=1 +... +python3 posebench/models/inference_relaxation.py method=alphafold3 dataset=astex_diverse remove_initial_protein_hydrogens=true repeat_index=1 +... +python3 posebench/models/inference_relaxation.py method=alphafold3 dataset=dockgen remove_initial_protein_hydrogens=true repeat_index=1 +... +``` + +Align predicted protein-ligand structures to ground-truth complex structures + +```bash +conda activate PyMOL-PoseBench +python3 posebench/analysis/complex_alignment.py method=alphafold3 dataset=posebusters_benchmark repeat_index=1 +... 
+python3 posebench/analysis/complex_alignment.py method=alphafold3 dataset=astex_diverse repeat_index=1 +... +python3 posebench/analysis/complex_alignment.py method=alphafold3 dataset=dockgen repeat_index=1 +conda deactivate +... +``` + +Analyze inference results for each dataset + +```bash +python3 posebench/analysis/inference_analysis.py method=alphafold3 dataset=posebusters_benchmark repeat_index=1 +... +python3 posebench/analysis/inference_analysis.py method=alphafold3 dataset=astex_diverse repeat_index=1 +... +python3 posebench/analysis/inference_analysis.py method=alphafold3 dataset=dockgen repeat_index=1 +... +``` + +Analyze inference results for the CASP15 dataset + +```bash +# first assemble (unrelaxed and post ranking-relaxed) CASP15-compliant prediction submission files for scoring +python3 posebench/models/ensemble_generation.py ensemble_methods=\[alphafold3\] input_csv_filepath=data/test_cases/casp15/ensemble_inputs.csv output_dir=data/test_cases/casp15/top_alphafold3_ensemble_predictions_1 skip_existing=true relax_method_ligands_post_ranking=false export_file_format=casp15 export_top_n=5 combine_casp_output_files=true max_method_predictions=5 method_top_n_to_select=5 resume=true ensemble_benchmarking=true ensemble_benchmarking_dataset=casp15 cuda_device_index=0 ensemble_benchmarking_repeat_index=1 +python3 posebench/models/ensemble_generation.py ensemble_methods=\[alphafold3\] input_csv_filepath=data/test_cases/casp15/ensemble_inputs.csv output_dir=data/test_cases/casp15/top_alphafold3_ensemble_predictions_1 skip_existing=true relax_method_ligands_post_ranking=true export_file_format=casp15 export_top_n=5 combine_casp_output_files=true max_method_predictions=5 method_top_n_to_select=5 resume=true ensemble_benchmarking=true ensemble_benchmarking_dataset=casp15 cuda_device_index=0 ensemble_benchmarking_repeat_index=1 +# NOTE: the suffixes for both `output_dir` and `ensemble_benchmarking_repeat_index` should be modified to e.g., 2, 3, ... +... 
+# now score the CASP15-compliant submissions using the official CASP scoring pipeline +python3 posebench/analysis/inference_analysis_casp.py method=alphafold3 dataset=casp15 repeat_index=1 +... +``` + ### How to run inference with `AutoDock Vina` Prepare CSV input files @@ -1073,6 +1144,7 @@ rm -rf docs/build/ && sphinx-build docs/source/ docs/build/ # NOTE: errors can s `PoseBench` builds upon the source code and data from the following projects: +- [alphafold3](https://github.com/google-deepmind/alphafold3) - [AutoDock-Vina](https://github.com/ccsb-scripps/AutoDock-Vina) - [casp15_ligand](https://git.scicore.unibas.ch/schwede/casp15_ligand) - [chai-lab](https://github.com/chaidiscovery/chai-lab) diff --git a/configs/data/af3_output_extraction.yaml b/configs/data/af3_output_extraction.yaml new file mode 100644 index 0000000..2903b95 --- /dev/null +++ b/configs/data/af3_output_extraction.yaml @@ -0,0 +1,12 @@ +dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`) +prediction_outputs_dir: ${oc.env:PROJECT_ROOT}/forks/alphafold3/prediction_outputs/${dataset}_${repeat_index} +inference_outputs_dir: ${oc.env:PROJECT_ROOT}/forks/alphafold3/inference/alphafold3_${dataset}_outputs_${repeat_index} +input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse +posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts +dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file +complex_filepath: null # if not `null`, this should be the path to the complex PDB file for which to extract outputs +complex_id: null # if not `null`, this should be the complex ID of the single complex for which to extract outputs +ligand_smiles: null 
# if not `null`, this should be the (i.e., `.` fragment-separated) complex ligand SMILES string of the single complex for which to extract outputs +output_dir: null # if not `null`, this should be the path to the output file to which to write the extracted outputs +repeat_index: 1 # the repeat index with which inference was run +pocket_only_baseline: false # whether to prepare the pocket-only baseline diff --git a/posebench/data/af3_output_extraction.py b/posebench/data/af3_output_extraction.py new file mode 100644 index 0000000..84bff79 --- /dev/null +++ b/posebench/data/af3_output_extraction.py @@ -0,0 +1,143 @@ +# ------------------------------------------------------------------------------------------------------------------------------------- +# Following code curated for PoseBench: (https://github.com/BioinfoMachineLearning/PoseBench) +# ------------------------------------------------------------------------------------------------------------------------------------- + +import logging +import os + +import hydra +import numpy as np +import rootutils +from omegaconf import DictConfig, open_dict + +rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True) + +from posebench.utils.data_utils import ( + extract_protein_and_ligands_with_prody, + parse_inference_inputs_from_dir, +) + +logging.basicConfig(format="[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s") +logger = logging.getLogger(__name__) + + +@hydra.main( + version_base="1.3", + config_path="../../configs/data", + config_name="af3_output_extraction.yaml", +) +def main(cfg: DictConfig): + """Extract proteins and ligands separately from the prediction outputs.""" + pdb_ids = None + if cfg.dataset == "posebusters_benchmark" and cfg.posebusters_ccd_ids_filepath is not None: + assert os.path.exists( + cfg.posebusters_ccd_ids_filepath + ), f"Invalid CCD IDs file path for PoseBusters Benchmark: {os.path.exists(cfg.posebusters_ccd_ids_filepath)}." 
+ with open(cfg.posebusters_ccd_ids_filepath) as f: + pdb_ids = set(f.read().splitlines()) + elif cfg.dataset == "dockgen" and cfg.dockgen_test_ids_filepath is not None: + assert os.path.exists( + cfg.dockgen_test_ids_filepath + ), f"Invalid test IDs file path for DockGen: {os.path.exists(cfg.dockgen_test_ids_filepath)}." + with open(cfg.dockgen_test_ids_filepath) as f: + pdb_ids = {line.replace(" ", "-") for line in f.read().splitlines()} + elif cfg.dataset not in ["posebusters_benchmark", "astex_diverse", "dockgen", "casp15"]: + raise ValueError(f"Dataset `{cfg.dataset}` not supported.") + + if cfg.pocket_only_baseline: + with open_dict(cfg): + cfg.prediction_outputs_dir = cfg.prediction_outputs_dir.replace( + cfg.dataset, f"{cfg.dataset}_pocket_only" + ) + cfg.inference_outputs_dir = cfg.inference_outputs_dir.replace( + f"alphafold3_{cfg.dataset}", f"alphafold3_pocket_only_{cfg.dataset}" + ) + + if cfg.complex_filepath is not None: + # process single-complex inputs + assert os.path.exists( + cfg.complex_filepath + ), f"Complex PDB file not found: {cfg.complex_filepath}" + assert ( + cfg.complex_id is not None + ), "Complex ID must be provided when extracting single complex outputs." + assert ( + cfg.ligand_smiles is not None + ), "Ligand SMILES must be provided when extracting single complex outputs." + assert ( + cfg.output_dir is not None + ), "Output directory must be provided when extracting single complex outputs." 
+ intermediate_output_filepath = cfg.complex_filepath + final_output_filepath = os.path.join( + cfg.output_dir, cfg.complex_id, os.path.basename(cfg.complex_filepath) + ) + os.makedirs(os.path.dirname(final_output_filepath), exist_ok=True) + try: + extract_protein_and_ligands_with_prody( + intermediate_output_filepath, + final_output_filepath.replace(".cif", "_protein.pdb"), + final_output_filepath.replace(".cif", "_ligand.sdf"), + sanitize=False, + add_element_types=True, + ligand_smiles=cfg.ligand_smiles, + ) + except Exception as e: + logger.error(f"Failed to extract protein and ligands for {cfg.complex_id} due to: {e}") + else: + # process all complexes in a dataset + smiles_and_pdb_id_list = parse_inference_inputs_from_dir( + cfg.input_data_dir, + pdb_ids=pdb_ids, + ) + pdb_id_to_smiles = {pdb_id: smiles for smiles, pdb_id in smiles_and_pdb_id_list} + for item in os.listdir(cfg.prediction_outputs_dir): + output_item_path = os.path.join(cfg.prediction_outputs_dir, item) + + if os.path.isdir(output_item_path): + for file in os.listdir(output_item_path): + if not file.endswith(".cif"): + continue + + if cfg.dataset in ["posebusters_benchmark", "astex_diverse"]: + item = item.upper() + ligand_smiles = pdb_id_to_smiles[item] + elif cfg.dataset == "dockgen": + item = item.upper() + item = "_".join([item.split("_")[0].lower(), *item.split("_")[1:]]) + ligand_smiles = pdb_id_to_smiles[item] + else: + # NOTE: for the `casp15` dataset, standalone ligand SMILES are not available + ligand_smiles = None + + intermediate_output_filepath = os.path.join(output_item_path, file) + final_output_filepath = os.path.join(cfg.inference_outputs_dir, item, file) + os.makedirs(os.path.dirname(final_output_filepath), exist_ok=True) + + try: + extract_protein_and_ligands_with_prody( + intermediate_output_filepath, + final_output_filepath.replace(".cif", "_protein.pdb"), + final_output_filepath.replace(".cif", "_ligand.sdf"), + sanitize=False, + add_element_types=True, + 
ligand_smiles=ligand_smiles, + ) + except Exception as e: + logger.error( + f"Failed to extract protein and ligands for {item} due to: {e}" + ) + try: + os.remove(final_output_filepath.replace(".cif", "_protein.pdb")) + os.remove(final_output_filepath.replace(".cif", "_ligand.sdf")) + except Exception as e: + logger.error( + f"Failed to remove partially extracted protein and ligands for {item} due to: {e}" + ) + + logger.info( + f"Finished extracting {cfg.dataset} protein and ligands from all prediction outputs." + ) + + +if __name__ == "__main__": + main() diff --git a/posebench/utils/data_utils.py b/posebench/utils/data_utils.py index fd0d530..a959556 100644 --- a/posebench/utils/data_utils.py +++ b/posebench/utils/data_utils.py @@ -5,6 +5,7 @@ import glob import logging import os +import re import shutil import subprocess # nosec from collections import defaultdict @@ -1508,14 +1509,14 @@ def read_ligand_expo( return df.to_dict("index") -def write_pdb_with_prody(protein, pdb_name, add_element_types=False): - """Write a protein to a pdb file using ProDy. +def write_pdb_with_prody(atoms, pdb_name, add_element_types=False): + """Write atoms to a pdb file using ProDy. 
- :param protein: protein object from prody + :param atoms: atoms object from prody :param pdb_name: base name for the pdb file :param add_element_types: whether to add element types to the pdb file """ - writePDB(pdb_name, protein) + writePDB(pdb_name, atoms) if add_element_types: with open(pdb_name.replace(".pdb", "_elem.pdb"), "w") as f: subprocess.run( # nosec @@ -1556,8 +1557,6 @@ def process_ligand_with_prody( :return: molecule with bond orders assigned """ sub_smiles_provided = sub_smiles is not None - - output = StringIO() sub_mol = ligand.select(f"resname {res_name} and chain {chain} and resnum {resnum}") ligand_expo_mapping = ligand_expo_mapping or read_ligand_expo() @@ -1573,6 +1572,7 @@ def process_ligand_with_prody( else: template = None + output = StringIO() writePDBStream(output, sub_mol) pdb_string = output.getvalue() rd_mol = AllChem.MolFromPDBBlock(pdb_string, sanitize=sanitize)