-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
234 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
dataset: posebusters_benchmark # the dataset to use - NOTE: must be one of (`posebusters_benchmark`, `astex_diverse`, `dockgen`, `casp15`) | ||
prediction_outputs_dir: ${oc.env:PROJECT_ROOT}/forks/alphafold3/prediction_outputs/${dataset}_${repeat_index} | ||
inference_outputs_dir: ${oc.env:PROJECT_ROOT}/forks/alphafold3/inference/alphafold3_${dataset}_outputs_${repeat_index} | ||
input_data_dir: ${oc.env:PROJECT_ROOT}/data/${dataset}_set # the input protein-ligand complex directory to recursively parse | ||
posebusters_ccd_ids_filepath: ${oc.env:PROJECT_ROOT}/data/posebusters_pdb_ccd_ids.txt # the path to the PoseBusters PDB CCD IDs file that lists the targets that do not contain any crystal contacts | ||
dockgen_test_ids_filepath: ${oc.env:PROJECT_ROOT}/data/dockgen_set/split_test.txt # the path to the DockGen test set IDs file | ||
complex_filepath: null # if not `null`, this should be the path to the complex PDB file for which to extract outputs | ||
complex_id: null # if not `null`, this should be the complex ID of the single complex for which to extract outputs | ||
ligand_smiles: null # if not `null`, this should be the ligand SMILES string (with multiple fragments separated by `.`) of the single complex for which to extract outputs | ||
output_dir: null # if not `null`, this should be the path to the output directory in which to write the extracted outputs | ||
repeat_index: 1 # the repeat index with which inference was run | ||
pocket_only_baseline: false # whether to prepare the pocket-only baseline |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
# ------------------------------------------------------------------------------------------------------------------------------------- | ||
# Following code curated for PoseBench: (https://github.com/BioinfoMachineLearning/PoseBench) | ||
# ------------------------------------------------------------------------------------------------------------------------------------- | ||
|
||
import logging | ||
import os | ||
|
||
import hydra | ||
import numpy as np | ||
import rootutils | ||
from omegaconf import DictConfig, open_dict | ||
|
||
rootutils.setup_root(__file__, indicator=".project-root", pythonpath=True) | ||
|
||
from posebench.utils.data_utils import ( | ||
extract_protein_and_ligands_with_prody, | ||
parse_inference_inputs_from_dir, | ||
) | ||
|
||
logging.basicConfig(format="[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s") | ||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def _derive_output_filepaths(final_output_filepath):
    """Return the `(protein_pdb, ligand_sdf)` output paths for a predicted structure file.

    Only the file extension is swapped, so directory names containing `.cif` are
    left untouched and inputs without a `.cif` extension still yield two distinct
    output paths.

    :param final_output_filepath: Destination path that mirrors the name of the predicted structure file.
    :return: A tuple of the protein PDB output path and the ligand SDF output path.
    """
    output_stem, _ = os.path.splitext(final_output_filepath)
    return f"{output_stem}_protein.pdb", f"{output_stem}_ligand.sdf"


def _remove_partial_outputs(filepaths, complex_id):
    """Delete whichever of the given partially written output files exist.

    Each file is removed independently so that one missing file does not prevent
    the removal of the others.

    :param filepaths: Output file paths to remove.
    :param complex_id: Complex identifier used in the error log message.
    """
    for filepath in filepaths:
        try:
            os.remove(filepath)
        except FileNotFoundError:
            # nothing was written to this path before the failure
            continue
        except OSError as e:
            logger.error(
                f"Failed to remove partially extracted protein and ligands for {complex_id} due to: {e}"
            )


@hydra.main(
    version_base="1.3",
    config_path="../../configs/data",
    config_name="af3_output_extraction.yaml",
)
def main(cfg: DictConfig):
    """Extract proteins and ligands separately from the prediction outputs.

    If `cfg.complex_filepath` is set, only that single complex is processed and its
    outputs are written beneath `cfg.output_dir/cfg.complex_id`. Otherwise, every
    subdirectory of `cfg.prediction_outputs_dir` is scanned for `.cif` files and the
    extracted outputs are written beneath `cfg.inference_outputs_dir/<complex ID>`.

    :param cfg: Configuration dictionary from the hydra YAML file.
    :raises ValueError: If `cfg.dataset` is not one of the supported datasets.
    :raises AssertionError: If a required file path or single-complex argument is missing.
    """
    supported_datasets = ("posebusters_benchmark", "astex_diverse", "dockgen", "casp15")
    if cfg.dataset not in supported_datasets:
        raise ValueError(f"Dataset `{cfg.dataset}` not supported.")

    # optionally restrict the parsed inputs to the IDs listed in a dataset-specific file
    pdb_ids = None
    if cfg.dataset == "posebusters_benchmark" and cfg.posebusters_ccd_ids_filepath is not None:
        assert os.path.exists(
            cfg.posebusters_ccd_ids_filepath
        ), f"Invalid CCD IDs file path for PoseBusters Benchmark: {cfg.posebusters_ccd_ids_filepath}."
        with open(cfg.posebusters_ccd_ids_filepath) as f:
            pdb_ids = set(f.read().splitlines())
    elif cfg.dataset == "dockgen" and cfg.dockgen_test_ids_filepath is not None:
        assert os.path.exists(
            cfg.dockgen_test_ids_filepath
        ), f"Invalid test IDs file path for DockGen: {cfg.dockgen_test_ids_filepath}."
        with open(cfg.dockgen_test_ids_filepath) as f:
            pdb_ids = {line.replace(" ", "-") for line in f.read().splitlines()}

    if cfg.pocket_only_baseline:
        # redirect both the input and output directories to their pocket-only counterparts
        with open_dict(cfg):
            cfg.prediction_outputs_dir = cfg.prediction_outputs_dir.replace(
                cfg.dataset, f"{cfg.dataset}_pocket_only"
            )
            cfg.inference_outputs_dir = cfg.inference_outputs_dir.replace(
                f"alphafold3_{cfg.dataset}", f"alphafold3_pocket_only_{cfg.dataset}"
            )

    if cfg.complex_filepath is not None:
        # process single-complex inputs
        assert os.path.exists(
            cfg.complex_filepath
        ), f"Complex PDB file not found: {cfg.complex_filepath}"
        assert (
            cfg.complex_id is not None
        ), "Complex ID must be provided when extracting single complex outputs."
        assert (
            cfg.ligand_smiles is not None
        ), "Ligand SMILES must be provided when extracting single complex outputs."
        assert (
            cfg.output_dir is not None
        ), "Output directory must be provided when extracting single complex outputs."
        intermediate_output_filepath = cfg.complex_filepath
        final_output_filepath = os.path.join(
            cfg.output_dir, cfg.complex_id, os.path.basename(cfg.complex_filepath)
        )
        os.makedirs(os.path.dirname(final_output_filepath), exist_ok=True)
        protein_output_filepath, ligand_output_filepath = _derive_output_filepaths(
            final_output_filepath
        )
        try:
            extract_protein_and_ligands_with_prody(
                intermediate_output_filepath,
                protein_output_filepath,
                ligand_output_filepath,
                sanitize=False,
                add_element_types=True,
                ligand_smiles=cfg.ligand_smiles,
            )
        except Exception as e:
            # best-effort: report the failure without aborting the script
            logger.error(f"Failed to extract protein and ligands for {cfg.complex_id} due to: {e}")
    else:
        # process all complexes in a dataset
        smiles_and_pdb_id_list = parse_inference_inputs_from_dir(
            cfg.input_data_dir,
            pdb_ids=pdb_ids,
        )
        pdb_id_to_smiles = {pdb_id: smiles for smiles, pdb_id in smiles_and_pdb_id_list}
        for item in os.listdir(cfg.prediction_outputs_dir):
            output_item_path = os.path.join(cfg.prediction_outputs_dir, item)
            if not os.path.isdir(output_item_path):
                continue

            cif_filenames = [
                filename for filename in os.listdir(output_item_path) if filename.endswith(".cif")
            ]
            if not cif_filenames:
                continue

            # map the prediction directory name onto the ID format used by the parsed inputs
            if cfg.dataset in ("posebusters_benchmark", "astex_diverse"):
                complex_id = item.upper()
            elif cfg.dataset == "dockgen":
                # DockGen IDs keep a lowercase first `_`-separated field followed by uppercase fields
                first_field, *other_fields = item.upper().split("_")
                complex_id = "_".join([first_field.lower(), *other_fields])
            else:
                complex_id = item

            if cfg.dataset == "casp15":
                # NOTE: for the `casp15` dataset, standalone ligand SMILES are not available
                ligand_smiles = None
            elif complex_id in pdb_id_to_smiles:
                ligand_smiles = pdb_id_to_smiles[complex_id]
            else:
                # e.g., a prediction for a target excluded by the IDs file; skip it rather than crash
                logger.warning(
                    f"Skipping {complex_id}, as no ligand SMILES were parsed for it from {cfg.input_data_dir}."
                )
                continue

            for filename in cif_filenames:
                intermediate_output_filepath = os.path.join(output_item_path, filename)
                final_output_filepath = os.path.join(
                    cfg.inference_outputs_dir, complex_id, filename
                )
                os.makedirs(os.path.dirname(final_output_filepath), exist_ok=True)
                protein_output_filepath, ligand_output_filepath = _derive_output_filepaths(
                    final_output_filepath
                )

                try:
                    extract_protein_and_ligands_with_prody(
                        intermediate_output_filepath,
                        protein_output_filepath,
                        ligand_output_filepath,
                        sanitize=False,
                        add_element_types=True,
                        ligand_smiles=ligand_smiles,
                    )
                except Exception as e:
                    logger.error(
                        f"Failed to extract protein and ligands for {complex_id} due to: {e}"
                    )
                    # do not leave half-written outputs behind for downstream steps to pick up
                    _remove_partial_outputs(
                        (protein_output_filepath, ligand_output_filepath), complex_id
                    )

    logger.info(
        f"Finished extracting {cfg.dataset} protein and ligands from all prediction outputs."
    )
|
||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters