Skip to content

Commit

Permalink
Add full dataset interaction notebook materials
Browse files Browse the repository at this point in the history
  • Loading branch information
amorehead committed Jun 3, 2024
1 parent fabac54 commit 8121755
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 6 deletions.
Binary file not shown.
Binary file added notebooks/dataset_interaction_analysis.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dataset Analysis Plotting"
"## Dataset Interaction Analysis Plotting"
]
},
{
Expand Down Expand Up @@ -67,19 +67,23 @@
"source": [
"ad_set_dir = os.path.join(\"..\", \"data\", \"astex_diverse_set\")\n",
"pb_set_dir = os.path.join(\"..\", \"data\", \"posebusters_benchmark_set\")\n",
"dg_set_dir = os.path.join(\"..\", \"data\", \"dockgen_set\")\n",
"casp15_set_dir = os.path.join(\n",
" \"..\",\n",
" \"data\",\n",
" \"casp15_set\",\n",
" \"casp15_set_public\",\n",
" \"targets\",\n",
")\n",
") # NOTE: change to `casp15_set` directory as needed\n",
"assert os.path.exists(\n",
" ad_set_dir\n",
"), \"Please download the Astex Diverse set from `https://zenodo.org/records/11199233` before proceeding.\"\n",
"assert os.path.exists(\n",
" pb_set_dir\n",
"), \"Please download the PoseBusters Benchmark set from `https://zenodo.org/records/11199233` before proceeding.\"\n",
"assert os.path.exists(\n",
" dg_set_dir\n",
"), \"Please download the DockGen set from `https://zenodo.org/records/11199233` before proceeding.\"\n",
"assert os.path.exists(\n",
" casp15_set_dir\n",
"), \"Please download the (public) CASP15 set from `https://zenodo.org/records/11199233` before proceeding.\"\n",
"\n",
Expand Down Expand Up @@ -248,6 +252,68 @@
" store.put(f\"df_{i}\", df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Analyze `DockGen` set interactions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if not os.path.exists(\"dockgen_interaction_dataframes.h5\"):\n",
" dockgen_test_ids_filepath = os.path.join(\n",
" \"..\", \"data\", \"dockgen_set\", \"split_test.txt\"\n",
" ) # NOTE: change as needed\n",
" assert os.path.exists(\n",
" dockgen_test_ids_filepath\n",
" ), f\"Invalid test IDs filepath for DockGen: {os.path.exists(dockgen_test_ids_filepath)}.\"\n",
" with open(dockgen_test_ids_filepath) as f:\n",
" pdb_ids = {line.replace(\" \", \"-\") for line in f.read().splitlines()}\n",
" dg_protein_ligand_filepath_pairs = []\n",
" for item in os.listdir(dg_set_dir):\n",
" if item not in pdb_ids:\n",
" continue\n",
" item_path = os.path.join(dg_set_dir, item)\n",
" if os.path.isdir(item_path):\n",
" protein_filepath = os.path.join(item_path, f\"{item}_protein_processed.pdb\")\n",
" ligand_filepath = os.path.join(item_path, f\"{item}_ligand.pdb\")\n",
" if os.path.exists(protein_filepath) and os.path.exists(ligand_filepath):\n",
" dg_protein_ligand_filepath_pairs.append((protein_filepath, ligand_filepath))\n",
"\n",
" pc = (\n",
" PoseCheck()\n",
" ) # NOTE: despite what `PoseCheck` might say, `reduce` should be available in the `PoseBench` environment\n",
" dg_protein_ligand_interaction_dfs = []\n",
" for protein_filepath, ligand_filepath in tqdm(\n",
" dg_protein_ligand_filepath_pairs, desc=\"Processing DockGen set\"\n",
" ):\n",
" try:\n",
" temp_protein_filepath = create_temp_pdb_with_only_molecule_type_residues(\n",
" protein_filepath, molecule_type=\"protein\"\n",
" )\n",
" ligand_mol = Chem.MolFromPDBFile(ligand_filepath)\n",
" if ligand_mol is None:\n",
" ligand_mol = Chem.MolFromPDFile(ligand_filepath, sanitize=False)\n",
" pc.load_protein_from_pdb(temp_protein_filepath)\n",
" pc.load_ligands_from_mols([ligand_mol])\n",
" dg_protein_ligand_interaction_dfs.append(pc.calculate_interactions())\n",
" except Exception as e:\n",
" print(\n",
" f\"Error processing Dockgen filepath pari {temp_protein_filepath} and {ligand_filepath} due to: {e}. Skipping...\"\n",
" )\n",
" continue\n",
"\n",
" # NOTE: we iteratively save the interaction dataframes to an HDF5 file\n",
" with pd.HDFStore(\"dockgen_interaction_dataframes.h5\") as store:\n",
" for i, df in enumerate(dg_protein_ligand_interaction_dfs):\n",
" store.put(f\"df_{i}\", df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -293,8 +359,7 @@
" pc.load_ligands_from_mols(\n",
" Chem.GetMolFrags(ligand_mol, asMols=True, sanitizeFrags=False)\n",
" )\n",
" interactions = pc.calculate_interactions()\n",
" casp15_protein_ligand_interaction_dfs.append(interactions)\n",
" casp15_protein_ligand_interaction_dfs.append(pc.calculate_interactions())\n",
" except Exception as e:\n",
" print(\n",
" f\"Error processing CASP15 target {protein_ligand_complex_filepath} due to: {e}. Skipping...\"\n",
Expand Down Expand Up @@ -366,6 +431,9 @@
" process_dataset(\"posebusters_benchmark_interaction_dataframes.h5\", \"PoseBusters Benchmark\")\n",
" )\n",
"\n",
"if os.path.exists(\"dockgen_interaction_dataframes.h5\"):\n",
" dfs.append(process_dataset(\"dockgen_interaction_dataframes.h5\", \"DockGen\"))\n",
"\n",
"if os.path.exists(\"casp15_interaction_dataframes.h5\"):\n",
" dfs.append(process_dataset(\"casp15_interaction_dataframes.h5\", \"CASP15\"))\n",
"\n",
Expand Down Expand Up @@ -416,7 +484,7 @@
" ax.grid(True)\n",
"\n",
"plt.tight_layout()\n",
"plt.savefig(\"astex_posebusters_casp15_interaction_analysis.png\", dpi=300)\n",
"plt.savefig(\"dataset_interaction_analysis.png\", dpi=300)\n",
"plt.show()"
]
}
Expand Down
Binary file added notebooks/dockgen_interaction_dataframes.h5
Binary file not shown.

0 comments on commit 8121755

Please sign in to comment.