From 8373e8f7eaec1c165e2b803d1e7ef1288926085c Mon Sep 17 00:00:00 2001 From: Alex Morehead Date: Tue, 4 Jun 2024 12:18:19 -0500 Subject: [PATCH] Fix FABind, DynamicBind, and RFAA Conda environments --- README.md | 2 +- environments/dynamicbind_environment.yaml | 2 +- environments/fabind_environment.yaml | 11 +++--- environments/rfaa_environment.yaml | 3 +- forks/RoseTTAFold-All-Atom/README.md | 38 +++++++++++-------- .../rf2aa/config/inference/protein_sm.yaml | 23 +++-------- 6 files changed, 37 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index 27a80401..daed428f 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ conda activate forks/DynamicBind/DynamicBind/ # NOTE: one still needs to use `c mamba env create -f environments/neuralplexer_environment.yaml --prefix forks/NeuralPLexer/NeuralPLexer/ conda activate forks/NeuralPLexer/NeuralPLexer/ # NOTE: one still needs to use `conda` to (de)activate environments cd forks/NeuralPLexer/ && pip3 install -e . && cd ../../ -# - RoseTTAFold-All-Atom environment (~14 GB) +# - RoseTTAFold-All-Atom environment (~14 GB) - NOTE: after running these commands, follow the installation instructions in `forks/RoseTTAFold-All-Atom/README.md` starting at Step 4 (with `forks/RoseTTAFold-All-Atom/` as the current working directory) mamba env create -f environments/rfaa_environment.yaml --prefix forks/RoseTTAFold-All-Atom/RFAA/ conda activate forks/RoseTTAFold-All-Atom/RFAA/ # NOTE: one still needs to use `conda` to (de)activate environments cd forks/RoseTTAFold-All-Atom/rf2aa/SE3Transformer/ && pip3 install --no-cache-dir -r requirements.txt && python3 setup.py install && cd ../../../../ diff --git a/environments/dynamicbind_environment.yaml b/environments/dynamicbind_environment.yaml index 36c2cc37..aec89961 100644 --- a/environments/dynamicbind_environment.yaml +++ b/environments/dynamicbind_environment.yaml @@ -239,7 +239,7 @@ dependencies: - platformdirs==2.5.2 - prompt-toolkit==3.0.36 - psutil==5.9.8 - - git+https://github.com/pyg-team/pyg-lib.git + - https://data.pyg.org/whl/torch-2.1.0%2Bcu118/pyg_lib-0.4.0%2Bpt21cu118-cp39-cp39-linux_x86_64.whl - pygments==2.15.1 - pyopenssl==23.0.0 - python-dotenv==1.0.1 diff --git a/environments/fabind_environment.yaml b/environments/fabind_environment.yaml index bf2f5d8f..42174792 100644 --- a/environments/fabind_environment.yaml +++ b/environments/fabind_environment.yaml @@ -131,7 +131,7 @@ dependencies: - pyarrow==15.0.0 - pyasn1==0.5.1 - pyasn1-modules==0.3.0 - - https://data.pyg.org/whl/torch-1.12.0%2Bcu113/pyg_lib-0.2.0%2Bpt112cu113-cp39-cp39-linux_x86_64.whl + - https://data.pyg.org/whl/torch-1.12.0%2Bcu113/pyg_lib-0.2.0%2Bpt112cu113-cp38-cp38-linux_x86_64.whl - pyparsing==3.1.1 - python-dateutil==2.8.2 - python-dotenv==1.0.1 @@ -149,12 +149,11 @@ dependencies: - tensorboard==2.14.0 - tensorboard-data-server==0.7.2 - threadpoolctl==3.2.0 - - https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_cluster-1.6.0%2Bpt112cu113-cp39-cp39-linux_x86_64.whl + - https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_cluster-1.6.0%2Bpt112cu113-cp38-cp38-linux_x86_64.whl - torch-geometric==2.4.0 - - https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_scatter-2.1.0%2Bpt112cu113-cp39-cp39-linux_x86_64.whl - - https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_sparse-0.6.15%2Bpt112cu113-cp39-cp39-linux_x86_64.whl - - torch-spline-conv==1.2.1+pt112cu113 - - https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_spline_conv-1.2.1%2Bpt112cu113-cp39-cp39-linux_x86_64.whl + - https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_scatter-2.1.0%2Bpt112cu113-cp38-cp38-linux_x86_64.whl + - https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_sparse-0.6.15%2Bpt112cu113-cp38-cp38-linux_x86_64.whl + - https://data.pyg.org/whl/torch-1.12.0%2Bcu113/torch_spline_conv-1.2.1%2Bpt112cu113-cp38-cp38-linux_x86_64.whl - torchdrug==0.1.2 - torchmetrics==0.10.2 - tqdm==4.66.1 diff --git a/environments/rfaa_environment.yaml b/environments/rfaa_environment.yaml index 8efea326..ecc21d42 100644 --- a/environments/rfaa_environment.yaml +++ b/environments/rfaa_environment.yaml @@ -1,5 +1,6 @@ name: RFAA channels: + - predector - pyg - bioconda - pytorch @@ -368,6 +369,7 @@ dependencies: - scikit-learn=1.4.1.post1=py310h1fdf081_0 - send2trash=1.8.3=pyh0d859eb_0 - setuptools=69.1.1=pyhd8ed1ab_0 + - signalp6=6.0g=1 - sip=6.7.12=py310hc6cd4ac_0 - six=1.16.0=pyh6c4a22f_0 - smirnoff99frosst=1.1.0=pyh44b312d_0 @@ -481,7 +483,6 @@ dependencies: - scipy==1.13.0 - sentry-sdk==1.41.0 - shortuuid==1.0.12 - - signalp6==6.0+h - smmap==5.0.1 - subprocess32==3.5.4 - timeout-decorator==0.5.0 diff --git a/forks/RoseTTAFold-All-Atom/README.md b/forks/RoseTTAFold-All-Atom/README.md index 019b1b5a..501ad01d 100644 --- a/forks/RoseTTAFold-All-Atom/README.md +++ b/forks/RoseTTAFold-All-Atom/README.md @@ -3,7 +3,7 @@ Code for RoseTTAFold All-Atom

alt text

-RoseTTAFold All-Atom is a biomolecular structure prediction neural network that can predict a broad range of biomolecular assemblies including proteins, nucleic acids, small molecules, covalent modifications and metals as outlined in the RFAA paper. +RoseTTAFold All-Atom is a biomolecular structure prediction neural network that can predict a broad range of biomolecular assemblies including proteins, nucleic acids, small molecules, covalent modifications and metals as outlined in the RFAA paper. RFAA is not accurate for all cases, but produces useful error estimates to allow users to identify accurate predictions. Below are the instructions for setting up and using the model. @@ -54,16 +54,11 @@ mv $CONDA_PREFIX/lib/python3.10/site-packages/signalp/model_weights/distilled_mo ``` bash install_dependencies.sh ``` -6. Add BLAST patch -``` -wget https://ftp.ncbi.nlm.nih.gov/blast/executables/legacy.NOTSUPPORTED/2.2.26/blast-2.2.26-x64-linux.tar.gz -tar -zxvf blast-2.2.26-x64-linux.tar.gz -``` -6. Download the model weights. +6. Download the model weights (if not already downloaded) ``` wget http://files.ipd.uw.edu/pub/RF-All-Atom/weights/RFAA_paper_weights.pt ``` -7. Download sequence databases for MSA and template generation. +7. Download sequence databases for MSA and template generation ``` # uniref30 [46G] wget http://wwwuser.gwdg.de/~compbiol/uniclust/2020_06/UniRef30_2020_06_hhsuite.tar.gz @@ -79,7 +74,17 @@ tar xfz bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar.gz -C ./bfd wget https://files.ipd.uw.edu/pub/RoseTTAFold/pdb100_2021Mar03.tar.gz tar xfz pdb100_2021Mar03.tar.gz ``` +**NOTE:** Make sure to update `DB_UR30` and `DB_BFD` (on Lines 19 and 20 of `make_msa.sh`) as well as `database_params.hhdb` (on Line 6 of `rf2aa/config/inference/base.yaml`) to list the absolute (base) paths to these respective local databases. For example, one may set these values to `DB_UR30="/bmlfast/rfaa_databases/uniref30/UniRef30_2020_06"`, `DB_BFD="/bmlfast/rfaa_databases/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt"`, and `hhdb: "/bmlfast/rfaa_databases/pdb100_2021Mar03/pdb100_2021Mar03"`. +8. Download `BLAST` +``` +wget https://ftp.ncbi.nlm.nih.gov/blast/executables/legacy.NOTSUPPORTED/2.2.26/blast-2.2.26-x64-linux.tar.gz +mkdir -p blast-2.2.26 +tar -xf blast-2.2.26-x64-linux.tar.gz -C blast-2.2.26 +cp -r blast-2.2.26/blast-2.2.26/ blast-2.2.26_bk +rm -r blast-2.2.26 +mv blast-2.2.26_bk/ blast-2.2.26 +``` ### Inference Configs Using Hydra @@ -150,27 +155,28 @@ python -m rf2aa.run_inference --config-name nucleic_acid ### Predicting Protein Small Molecule Complexes To predict protein small molecule complexes, the syntax to input the protein remains the same. Adding in the small molecule works similarly to other inputs. -Here is an example (from `rf2aa/config/inference/protein_complex_sm.yaml`): +Here is an example (from `rf2aa/config/inference/protein_sm.yaml`): ``` defaults: - base - -job_name: 7qxr +job_name: "3fap" protein_inputs: - A: - fasta_file: examples/protein/7qxr.fasta + A: + fasta_file: examples/protein/3fap_A.fasta + B: + fasta_file: examples/protein/3fap_B.fasta sm_inputs: - B: - input: examples/small_molecule/NSW_ideal.sdf + C: + input: examples/small_molecule/ARD_ideal.sdf input_type: "sdf" ``` Small molecule inputs are provided as sdf files or smiles strings and users are **required** to provide both an input and an input_type field for every small molecule that they want to provide. Metal ions can also be provided as sdf files or smiles strings. To predict the example: ``` -python -m rf2aa.run_inference --config-name protein_complex_sm +python -m rf2aa.run_inference --config-name protein_sm ``` ### Predicting Higher Order Complexes diff --git a/forks/RoseTTAFold-All-Atom/rf2aa/config/inference/protein_sm.yaml b/forks/RoseTTAFold-All-Atom/rf2aa/config/inference/protein_sm.yaml index 3f3dad6d..939e13e4 100644 --- a/forks/RoseTTAFold-All-Atom/rf2aa/config/inference/protein_sm.yaml +++ b/forks/RoseTTAFold-All-Atom/rf2aa/config/inference/protein_sm.yaml @@ -1,24 +1,13 @@ defaults: - base -job_name: "T1188" + +job_name: 7qxr protein_inputs: - A: - fasta_file: examples/protein/T1188_A.fasta + A: + fasta_file: examples/protein/7qxr.fasta sm_inputs: B: - input: CN1CNC2C1C(O)N(CCCN1C(O)C3C(NCN3C)N(C)C1O)C(O)N2C - input_type: "smiles" - C: - input: Cn1cnc2c1c(=O)n(CCCn1c(=O)c3c(ncn3C)n(C)c1=O)c(=O)n2C - input_type: "smiles" - D: - input: [Cd+2] - input_type: "smiles" - E: - input: [Cd] - input_type: "smiles" - F: - input: [Co] - input_type: "smiles" \ No newline at end of file + input: examples/small_molecule/NSW_ideal.sdf + input_type: "sdf" \ No newline at end of file