Compare commits: 6381ddd6e9 ... 099769d2ec (45 commits)

| Author | SHA1 |
|---|---|
| Christina Floristean | 099769d2ec |
| Jennifer Wei | 2338b896c7 |
| dependabot[bot] | dcd809d9c2 |
| Jennifer Wei | 51472e756a |
| dependabot[bot] | 13728b1203 |
| Jennifer Wei | f7dba95f0b |
| Jennifer Wei | e3716118cd |
| Jennifer Wei | 6df89c763f |
| Jennifer Wei | 13f0f6fe16 |
| Sachin Kadyan | 9e32781fd6 |
| jnwei | f68a6c694b |
| jnwei | a5c69a79c6 |
| jnwei | e2bb3c4b90 |
| jnwei | 6fe34248b2 |
| jnwei | 5efba4425a |
| jnwei | 3817d94098 |
| jnwei | a5a86d4323 |
| Jennifer Wei | f06657fe8a |
| Sachin Kadyan | 2d4fe4f414 |
| jnwei | a90da39554 |
| Sachin Kadyan | 86b990d6ed |
| Sachin Kadyan | 8185c30775 |
| Sachin Kadyan | 4c8e37644e |
| jnwei | 5f5c8f2a5b |
| Matthew W. Thompson | 7666c80272 |
| Matthew W. Thompson | 582103505d |
| Matthew W. Thompson | 736d668741 |
| Matthew W. Thompson | 32c11376d7 |
| Matthew W. Thompson | f86d42f40e |
| Matthew W. Thompson | 6bf5c8cea1 |
| Sachin Kadyan | 92835fd5e6 |
| Sachin Kadyan | 0026173e23 |
| Sachin Kadyan | bcc6d97b69 |
| Gustaf Ahdritz | 0c20e3c989 |
| jnwei | d6ae9f5894 |
| Jennifer Wei | b3a118fc83 |
| Jennifer Wei | 2893fd934b |
| Jennifer Wei | 3e3f07c7f2 |
| Jennifer Wei | fcba33580e |
| Gustaf Ahdritz | 2300f6720d |
| Jennifer | d77a8dabea |
| Jennifer | fb34a0cb62 |
| Jennifer Wei | 705c26773d |
| Jennifer Wei | 4fde713c05 |
| Jennifer Wei | 7922bd57f1 |
.github/dependabot.yml (new file)

```diff
@@ -0,0 +1,7 @@
+version: 2
+updates:
+
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "daily"
```
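This config registers daily Dependabot version checks for GitHub Actions only (the config file conventionally lives at `.github/dependabot.yml`). As a hypothetical sketch, not part of this commit, the same format could also track Python dependencies by adding a second update block:

```yaml
# Hypothetical extension -- the "pip" block is an illustration,
# not something this commit adds.
version: 2
updates:
  - package-ecosystem: "github-actions"
    directory: "/"
    schedule:
      interval: "daily"
  - package-ecosystem: "pip"
    directory: "/"
    schedule:
      interval: "weekly"
```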
GitHub Actions workflow: Docker image build

```diff
@@ -10,6 +10,6 @@ jobs:
   build:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
     - name: Build the Docker image
       run: docker build . --file Dockerfile --tag openfold:$(date +%s)
```
GitHub Actions workflow: flake8 check

```diff
@@ -4,8 +4,8 @@ jobs:
   undefined_names:
     runs-on: ubuntu-latest
     steps:
-    - uses: actions/checkout@v2
-    - uses: actions/setup-python@v2
+    - uses: actions/checkout@v4
+    - uses: actions/setup-python@v4
     - run: pip install --upgrade pip
     - run: pip install flake8
     - run: flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
```
Dockerfile (13 changed lines)
```diff
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:11.3.1-cudnn8-runtime-ubuntu18.04
+FROM nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04
 
 # metainformation
 LABEL org.opencontainers.image.version = "1.0.0"
```
```diff
@@ -13,24 +13,23 @@ RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/
 
 RUN apt-get update && apt-get install -y wget libxml2 cuda-minimal-build-11-3 libcusparse-dev-11-3 libcublas-dev-11-3 libcusolver-dev-11-3 git
 RUN wget -P /tmp \
-    "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" \
-    && bash /tmp/Miniconda3-latest-Linux-x86_64.sh -b -p /opt/conda \
-    && rm /tmp/Miniconda3-latest-Linux-x86_64.sh
+    "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh" \
+    && bash /tmp/Miniforge3-Linux-x86_64.sh -b -p /opt/conda \
+    && rm /tmp/Miniforge3-Linux-x86_64.sh
 ENV PATH /opt/conda/bin:$PATH
 
 COPY environment.yml /opt/openfold/environment.yml
 
 # installing into the base environment since the docker container wont do anything other than run openfold
-RUN conda env update -n base --file /opt/openfold/environment.yml && conda clean --all
+RUN mamba env update -n base --file /opt/openfold/environment.yml && mamba clean --all
 RUN export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}
 
 COPY openfold /opt/openfold/openfold
 COPY scripts /opt/openfold/scripts
 COPY run_pretrained_openfold.py /opt/openfold/run_pretrained_openfold.py
 COPY train_openfold.py /opt/openfold/train_openfold.py
 COPY setup.py /opt/openfold/setup.py
 COPY lib/openmm.patch /opt/openfold/lib/openmm.patch
 RUN wget -q -P /opt/openfold/openfold/resources \
     https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt
 RUN patch -p0 -d /opt/conda/lib/python3.9/site-packages/ < /opt/openfold/lib/openmm.patch
 WORKDIR /opt/openfold
 RUN python3 setup.py install
```
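The switch from the `runtime` to the `devel` CUDA base image, plus the move from conda to mamba, means the image carries the toolchain needed to compile OpenFold's kernels at build time. A minimal sketch of building and entering the resulting image; only the `docker build` invocation comes from the CI workflow above, while the tag, mount path, and run flags are illustrative:

```bash
# Build (same command the workflow runs, with a fixed tag instead of a timestamp)
docker build . --file Dockerfile --tag openfold:dev

# Enter the container with GPU access (requires the NVIDIA Container Toolkit;
# the bind mount is illustrative)
docker run --rm -it --gpus all -v "$PWD/data:/data" openfold:dev bash
```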
README.md (75 changed lines)
````diff
@@ -48,37 +48,19 @@ and one of {`jackhmmer`, [MMseqs2](https://github.com/soedinglab/mmseqs2) (night
 installed on your system. You'll need `git-lfs` to download OpenFold parameters.
 Finally, some download scripts require `aria2c` and `aws`.
 
-For convenience, we provide a script that installs Miniconda locally, creates a
-`conda` virtual environment, installs all Python dependencies, and downloads
-useful resources, including both sets of model parameters. Run:
-
-```bash
-scripts/install_third_party_dependencies.sh
-```
-
-To activate the environment, run:
-
-```bash
-source scripts/activate_conda_env.sh
-```
-
-To deactivate it, run:
-
-```bash
-source scripts/deactivate_conda_env.sh
-```
-
-With the environment active, compile OpenFold's CUDA kernels with
-
-```bash
-python3 setup.py install
-```
-
-To install the HH-suite to `/usr/bin`, run
-
-```bash
-# scripts/install_hh_suite.sh
-```
+This package is currently supported for CUDA 11 and PyTorch 1.12.
+
+To install:
+1. Clone the repository, e.g. `git clone https://github.com/aqlaboratory/openfold.git`
+1. From the `openfold` repo:
+    - Create a [Mamba](https://github.com/conda-forge/miniforge) environment, e.g.
+      `mamba env create -n openfold_env -f environment.yml`.
+      Mamba is recommended because the dependencies required by OpenFold are quite large and Mamba can speed up the process.
+    - Activate the environment, e.g. `conda activate openfold_env`
+1. Run `scripts/install_third_party_dependencies.sh` to configure kernels and folding resources.
+
+For some systems, it may help to append the Conda environment library path to `$LD_LIBRARY_PATH`. The `install_third_party_dependencies.sh` script does this once, but you may need this for each bash instance.
 
 ## Usage
````
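The `$LD_LIBRARY_PATH` note added above corresponds to the one-liner the install script runs (see the script diff further down); per shell session it amounts to:

```bash
# Append the active Conda env's libraries, as
# scripts/install_third_party_dependencies.sh does once at install time
export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH
```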
````diff
@@ -232,28 +214,51 @@ efficient AlphaFold-Multimer more than double the time. Use the
 at once. The `run_pretrained_openfold.py` script can enable this config option with the
 `--long_sequence_inference` command line option.
 
-#### Single-Sequence Model Inference
-To run inference for a sequence using the single-sequence model, first you need the ESM-1b embedding for the sequence. For this you need to set up the [ESM](https://www.github.com/facebookresearch/esm.git) model on your system. Once you have the setup ready, use the following command in the ESM model directory to generate an embedding:
+#### SoloSeq Inference
+To run inference for a sequence using the SoloSeq single-sequence model, you can either precompute ESM-1b embeddings in bulk, or you can generate them during inference.
+
+For generating ESM-1b embeddings in bulk, use the provided script `scripts/precompute_embeddings.py`. The script takes a directory of FASTA files (one sequence per file) and generates ESM-1b embeddings in the same format and directory structure as required by SoloSeq. The following is an example command:
 
 ```bash
-cd <esm_dir>
-python scripts/extract.py esm1b_t33_650M_UR50S <fasta> output_dir --include per_tok
+python scripts/precompute_embeddings.py fasta_dir/ embeddings_output_dir/
 ```
 
-Once you have the `*.pt` embedding file, you can place it in that sequence's alignments directory (the same one used by the MSA model of OpenFold). That is, inside the top-level alignments directory, there will be one subdirectory for each sequence you want to run inference on, like so: `alignments_dir/{sequence_id}/{sequence_id}.pt`. You can also place a `*.hhr` file in the same directory, which can contain the details about the structures that you want to use as templates.
+In the same per-label subdirectories inside `embeddings_output_dir`, you can also place `*.hhr` files (outputs from HHSearch), which can contain the details about the structures that you want to use as templates. If you do not place any such file, templates will not be used and only the ESM-1b embeddings will be used to predict the structure. If you want to use templates, you need to pass the PDB mmCIF dataset to the command.
 
 Now, you are ready to run inference:
 
 ```bash
 python run_pretrained_openfold.py \
     fasta_dir \
     data/pdb_mmcif/mmcif_files/ \
-    --use_precomputed_alignments alignments_dir \
+    --use_precomputed_alignments embeddings_output_dir \
     --output_dir ./ \
     --model_device "cuda:0" \
     --config_preset "seq_model_esm1b_ptm" \
     --openfold_checkpoint_path openfold/resources/openfold_params/seq_model_esm1b_ptm.pt
 ```
 
+For generating the embeddings during inference, skip the `--use_precomputed_alignments` argument. The `*.hhr` files will be generated as well if you pass the paths to the relevant databases and tools, as specified in the command below. If you skip the database and tool arguments, HHSearch will not be used to find templates and only the generated ESM-1b embeddings will be used to predict the structure.
+```bash
+python3 run_pretrained_openfold.py \
+    fasta_dir \
+    data/pdb_mmcif/mmcif_files/ \
+    --output_dir ./ \
+    --model_device "cuda:0" \
+    --config_preset "seq_model_esm1b_ptm" \
+    --openfold_checkpoint_path openfold/resources/openfold_params/seq_model_esm1b_ptm.pt \
+    --uniref90_database_path data/uniref90/uniref90.fasta \
+    --pdb70_database_path data/pdb70/pdb70 \
+    --jackhmmer_binary_path lib/conda/envs/openfold_venv/bin/jackhmmer \
+    --hhsearch_binary_path lib/conda/envs/openfold_venv/bin/hhsearch \
+    --kalign_binary_path lib/conda/envs/openfold_venv/bin/kalign
+```
+
+For generating template information, you will need the UniRef90 and PDB70 databases and the JackHmmer and HHSearch binaries.
+
+SoloSeq allows you to use the same flags and optimizations as the MSA-based OpenFold. For example, you can skip relaxation using `--skip_relaxation`, save all model outputs using `--save_outputs`, and generate output files in mmCIF format using `--cif_output`.
+
+**NOTE:** Due to the nature of the ESM-1b embeddings, the sequence length for inference using the SoloSeq model is limited to 1022 residues. Sequences longer than that will be truncated.
 
 ### Training
 
 To train the model, you will first need to precompute protein alignments.
````
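For reference, the per-label directory structure that `precompute_embeddings.py` writes and `run_pretrained_openfold.py` consumes looks like the sketch below; `seq1` is an illustrative sequence ID, not a file shipped with the repo:

```
embeddings_output_dir/
└── seq1/
    ├── seq1.pt    # ESM-1b embedding written by scripts/precompute_embeddings.py
    └── seq1.hhr   # optional HHSearch hits, used for templates if present
```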
````diff
@@ -461,7 +466,7 @@ Please cite our paper:
 
 ```bibtex
 @article {Ahdritz2022.11.20.517210,
-    author = {Ahdritz, Gustaf and Bouatta, Nazim and Kadyan, Sachin and Xia, Qinghui and Gerecke, William and O{\textquoteright}Donnell, Timothy J and Berenberg, Daniel and Fisk, Ian and Zanichelli, Niccolò and Zhang, Bo and Nowaczynski, Arkadiusz and Wang, Bei and Stepniewska-Dziubinska, Marta M and Zhang, Shang and Ojewole, Adegoke and Guney, Murat Efe and Biderman, Stella and Watkins, Andrew M and Ra, Stephen and Lorenzo, Pablo Ribalta and Nivon, Lucas and Weitzner, Brian and Ban, Yih-En Andrew and Sorger, Peter K and Mostaque, Emad and Zhang, Zhao and Bonneau, Richard and AlQuraishi, Mohammed},
+    author = {Ahdritz, Gustaf and Bouatta, Nazim and Floristean, Christina and Kadyan, Sachin and Xia, Qinghui and Gerecke, William and O{\textquoteright}Donnell, Timothy J and Berenberg, Daniel and Fisk, Ian and Zanichelli, Niccolò and Zhang, Bo and Nowaczynski, Arkadiusz and Wang, Bei and Stepniewska-Dziubinska, Marta M and Zhang, Shang and Ojewole, Adegoke and Guney, Murat Efe and Biderman, Stella and Watkins, Andrew M and Ra, Stephen and Lorenzo, Pablo Ribalta and Nivon, Lucas and Weitzner, Brian and Ban, Yih-En Andrew and Sorger, Peter K and Mostaque, Emad and Zhang, Zhao and Bonneau, Richard and AlQuraishi, Mohammed},
     title = {{O}pen{F}old: {R}etraining {A}lpha{F}old2 yields new insights into its learning mechanisms and capacity for generalization},
     elocation-id = {2022.11.20.517210},
     year = {2022},
````
environment.yml

```diff
@@ -1,31 +1,36 @@
-name: openfold_venv
+name: openfold-venv
 channels:
   - conda-forge
   - bioconda
   - pytorch
 dependencies:
-  - conda-forge::python=3.9
-  - conda-forge::setuptools=59.5.0
-  - conda-forge::pip
-  - conda-forge::openmm=7.5.1
-  - conda-forge::pdbfixer
-  - conda-forge::cudatoolkit==11.3.*
+  - python=3.9
+  - libgcc=7.2
+  - setuptools=59.5.0
+  - pip
+  - openmm=7.7
+  - pdbfixer
+  - cudatoolkit==11.3.*
+  - pytorch-lightning==1.5.10
+  - biopython==1.79
+  - numpy==1.21
+  - PyYAML==5.4.1
+  - requests
+  - scipy==1.7
+  - tqdm==4.62.2
+  - typing-extensions==3.10
+  - wandb==0.12.21
+  - modelcif==0.7
+  - awscli
+  - ml-collections
+  - aria2
+  - git
   - bioconda::hmmer==3.3.2
   - bioconda::hhsuite==3.3.0
   - bioconda::kalign2==2.04
   - pytorch::pytorch=1.12.*
   - pip:
-      - biopython==1.79
       - deepspeed==0.5.10
       - dm-tree==0.1.6
-      - ml-collections==0.1.0
-      - numpy==1.21.2
-      - PyYAML==5.4.1
-      - requests==2.26.0
-      - scipy==1.7.1
-      - tqdm==4.62.2
-      - typing-extensions==3.10.0.2
-      - pytorch_lightning==1.5.10
-      - wandb==0.12.21
-      - modelcif==0.7
       - git+https://github.com/NVIDIA/dllogger.git
+      - git+https://github.com/Dao-AILab/flash-attention.git@5b838a8
```
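Creating an environment from this file matches the updated README instructions; the `-n` flag overrides the `name:` field in the YAML:

```bash
# Mamba resolves OpenFold's large dependency set much faster than conda
mamba env create -n openfold_env -f environment.yml
conda activate openfold_env
```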
lib/openmm.patch (deleted)

```diff
@@ -1,42 +0,0 @@
-Index: simtk/openmm/app/topology.py
-===================================================================
---- simtk.orig/openmm/app/topology.py
-+++ simtk/openmm/app/topology.py
-@@ -356,19 +356,35 @@
-         def isCyx(res):
-             names = [atom.name for atom in res._atoms]
-             return 'SG' in names and 'HG' not in names
-+        # This function is used to prevent multiple di-sulfide bonds from being
-+        # assigned to a given atom. This is a DeepMind modification.
-+        def isDisulfideBonded(atom):
-+            for b in self._bonds:
-+                if (atom in b and b[0].name == 'SG' and
-+                    b[1].name == 'SG'):
-+                    return True
-+
-+            return False
- 
-         cyx = [res for res in self.residues() if res.name == 'CYS' and isCyx(res)]
-         atomNames = [[atom.name for atom in res._atoms] for res in cyx]
-         for i in range(len(cyx)):
-             sg1 = cyx[i]._atoms[atomNames[i].index('SG')]
-             pos1 = positions[sg1.index]
-+            candidate_distance, candidate_atom = 0.3*nanometers, None
-             for j in range(i):
-                 sg2 = cyx[j]._atoms[atomNames[j].index('SG')]
-                 pos2 = positions[sg2.index]
-                 delta = [x-y for (x,y) in zip(pos1, pos2)]
-                 distance = sqrt(delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2])
--                if distance < 0.3*nanometers:
--                    self.addBond(sg1, sg2)
-+                if distance < candidate_distance and not isDisulfideBonded(sg2):
-+                    candidate_distance = distance
-+                    candidate_atom = sg2
-+            # Assign bond to closest pair.
-+            if candidate_atom:
-+                self.addBond(sg1, candidate_atom)
-+
-+
- 
- class Chain(object):
-     """A Chain object represents a chain within a Topology."""
```
openfold/np/relax/amber_minimize.py

```diff
@@ -28,18 +28,10 @@ import openfold.utils.loss as loss
 from openfold.np.relax import cleanup, utils
 import ml_collections
 import numpy as np
-try:
-    # openmm >= 7.6
-    import openmm
-    from openmm import unit
-    from openmm import app as openmm_app
-    from openmm.app.internal.pdbstructure import PdbStructure
-except ImportError:
-    # openmm < 7.6 (requires DeepMind patch)
-    from simtk import openmm
-    from simtk import unit
-    from simtk.openmm import app as openmm_app
-    from simtk.openmm.app.internal.pdbstructure import PdbStructure
+import openmm
+from openmm import unit
+from openmm import app as openmm_app
+from openmm.app.internal.pdbstructure import PdbStructure
 
 ENERGY = unit.kilocalories_per_mole
 LENGTH = unit.angstroms
```
openfold/np/relax/cleanup.py

```diff
@@ -20,14 +20,8 @@ cases like removing chains of length one (see clean_structure).
 import io
 
 import pdbfixer
-try:
-    # openmm >= 7.6
-    from openmm import app
-    from openmm.app import element
-except ImportError:
-    # openmm < 7.6 (requires DeepMind patch)
-    from simtk.openmm import app
-    from simtk.openmm.app import element
+from openmm import app
+from openmm.app import element
 
 
 def fix_pdb(pdbfile, alterations_info):
```
openfold/np/relax/utils.py

```diff
@@ -18,14 +18,8 @@ import io
 from openfold.np import residue_constants
 from Bio import PDB
 import numpy as np
-try:
-    # openmm >= 7.6
-    from openmm import app as openmm_app
-    from openmm.app.internal.pdbstructure import PdbStructure
-except ImportError:
-    # openmm < 7.6 (requires DeepMind patch)
-    from simtk.openmm import app as openmm_app
-    from simtk.openmm.app.internal.pdbstructure import PdbStructure
+from openmm import app as openmm_app
+from openmm.app.internal.pdbstructure import PdbStructure
 
 
 def overwrite_pdb_coordinates(pdb_str: str, pos) -> str:
```
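All three hunks above drop the `simtk` fallback because the newly pinned `openmm=7.7` exposes the top-level `openmm` namespace directly, making the DeepMind patch and the try/except shim unnecessary. A quick sanity check under that assumption:

```python
# Assumes openmm >= 7.6 from the updated environment.yml; the old
# "from simtk import openmm" spelling is no longer needed.
import openmm

print(openmm.Platform.getOpenMMVersion())  # e.g. "7.7"
```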
run_pretrained_openfold.py

```diff
@@ -55,6 +55,7 @@ from openfold.utils.trace_utils import (
     pad_feature_dict_seq,
     trace_model_,
 )
+from scripts.precompute_embeddings import EmbeddingGenerator
 from scripts.utils import add_data_args
 
 
@@ -82,6 +83,8 @@ def precompute_alignments(tags, seqs, alignment_dir, args):
             pdb70_database_path=args.pdb70_database_path,
             no_cpus=args.cpus,
         )
+        embedding_generator = EmbeddingGenerator()
+        embedding_generator.run(tmp_fasta_path, alignment_dir)
     else:
         alignment_runner = data_pipeline.AlignmentRunner(
             jackhmmer_binary_path=args.jackhmmer_binary_path,
```
```diff
@@ -373,7 +376,7 @@ if __name__ == "__main__":
         help="""Postfix for output prediction filenames"""
     )
     parser.add_argument(
-        "--data_random_seed", type=str, default=None
+        "--data_random_seed", type=int, default=None
     )
     parser.add_argument(
         "--skip_relaxation", action="store_true", default=False,
```
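The `--data_random_seed` change matters because argparse hands downstream code whatever `type` produces; a minimal sketch of the before/after behavior:

```python
import argparse

parser = argparse.ArgumentParser()
# After this commit: the seed arrives as an int, suitable for seeding RNGs.
parser.add_argument("--data_random_seed", type=int, default=None)

args = parser.parse_args(["--data_random_seed", "42"])
assert args.data_random_seed == 42  # with type=str this would have been "42"
```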
scripts/install_third_party_dependencies.sh

```diff
@@ -1,49 +1,18 @@
 #!/bin/bash
-CONDA_INSTALL_URL=${CONDA_INSTALL_URL:-"https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh"}
-
-source scripts/vars.sh
-
-# Install Miniconda locally
-rm -rf lib/conda
-rm -f /tmp/Miniconda3-latest-Linux-x86_64.sh
-wget -P /tmp \
-    "${CONDA_INSTALL_URL}" \
-    && bash /tmp/Miniconda3-latest-Linux-x86_64.sh -b -p lib/conda \
-    && rm /tmp/Miniconda3-latest-Linux-x86_64.sh
-
-# Grab conda-only packages
-export PATH=lib/conda/bin:$PATH
-lib/conda/bin/python3 -m pip install nvidia-pyindex
-conda env create --name=${ENV_NAME} -f environment.yml
-source scripts/activate_conda_env.sh
-
-echo "Attempting to install FlashAttention"
-git clone https://github.com/HazyResearch/flash-attention
-CUR_DIR=$PWD
-cd flash-attention
-git checkout 5b838a8bef
-python3 setup.py install
-cd $CUR_DIR
-
-# Install DeepMind's OpenMM patch
-OPENFOLD_DIR=$PWD
-pushd lib/conda/envs/$ENV_NAME/lib/python3.9/site-packages/ \
-    && patch -p0 < $OPENFOLD_DIR/lib/openmm.patch \
-    && popd
-
 # Download folding resources
-wget --no-check-certificate -P openfold/resources \
+wget -N --no-check-certificate -P openfold/resources \
     https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt
 
 # Certain tests need access to this file
 mkdir -p tests/test_data/alphafold/common
 ln -rs openfold/resources/stereo_chemical_props.txt tests/test_data/alphafold/common
 
-echo "Downloading OpenFold parameters..."
-bash scripts/download_openfold_params.sh openfold/resources
-
-echo "Downloading AlphaFold parameters..."
-bash scripts/download_alphafold_params.sh openfold/resources
-
 # Decompress test data
 gunzip -c tests/test_data/sample_feats.pickle.gz > tests/test_data/sample_feats.pickle
 
+python setup.py install
+
+export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH
+# This setting is used to fix a worker assignment issue during data loading
+conda env config vars set KMP_AFFINITY=none
```
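With environment creation moved to the README's mamba step and FlashAttention moved into `environment.yml`, the slimmed-down script is now pure post-install setup. Assumed invocation, matching install step 3 in the README diff above:

```bash
# Run from the repository root with the openfold environment already active
bash scripts/install_third_party_dependencies.sh
```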
scripts/precompute_embeddings.py (new file; @@ -0,0 +1,200 @@ — all lines added, shown here without diff markers)
```python
# Some functions borrowed from [ESM](https://www.github.com/facebookresearch/esm)
import argparse
import logging
import os

import torch

from openfold.data import parsers

logging.basicConfig(level=logging.INFO)


class SequenceDataset(object):
    def __init__(self, labels, sequences) -> None:
        self.labels = labels
        self.sequences = sequences

    @classmethod
    def from_file(cls, fasta_file):
        labels, sequences = [], []

        with open(fasta_file, "r") as infile:
            fasta_str = infile.read()
            sequences, labels = parsers.parse_fasta(fasta_str)

        assert len(set(labels)) == len(labels),\
            "Sequence labels need to be unique. Duplicates found!"

        return cls(labels, sequences)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.labels[idx], self.sequences[idx]

    def get_batch_indices(self, toks_per_batch, extra_toks_per_seq):
        sizes = [(len(s), i) for i, s in enumerate(self.sequences)]
        sizes.sort()
        batches = []
        buf = []
        max_len = 0

        def _flush_current_buf():
            nonlocal max_len, buf
            if len(buf) == 0:
                return
            batches.append(buf)
            buf = []
            max_len = 0

        for sz, i in sizes:
            sz += extra_toks_per_seq
            if max(sz, max_len) * (len(buf) + 1) > toks_per_batch:
                _flush_current_buf()
            max_len = max(max_len, sz)
            buf.append(i)

        _flush_current_buf()
        return batches


class EmbeddingGenerator:
    """Generates the ESM-1b embeddings for the single-sequence model"""
    def __init__(self,
                 toks_per_batch: int = 4096,
                 truncate: bool = True,
                 use_local_esm: str = None,
                 nogpu: bool = False,
                 ):
        self.toks_per_batch = toks_per_batch
        self.truncate = truncate
        self.use_local_esm = use_local_esm
        self.nogpu = nogpu

        # Generate embeddings in bulk
        if self.use_local_esm:
            self.model, self.alphabet = torch.hub.load(self.use_local_esm, "esm1b_t33_650M_UR50S", source='local')
        else:
            self.model, self.alphabet = torch.hub.load("facebookresearch/esm:main", "esm1b_t33_650M_UR50S")
        if torch.cuda.is_available() and not self.nogpu:
            self.model = self.model.to(device="cuda")

    def parse_sequences(self, fasta_dir, output_dir):
        labels = []
        seqs = []

        # Generate a single bulk file
        for f in os.listdir(fasta_dir):
            f_name, ext = os.path.splitext(f)
            if ext != '.fasta' and ext != '.fa':
                logging.warning(f"Ignoring non-FASTA file: {f}")
                continue
            with open(os.path.join(fasta_dir, f), 'r') as infile:
                seq = infile.readlines()[1].strip()
            labels.append(f_name)
            seqs.append(seq)

        lines = []
        for label, seq in zip(labels, seqs):
            lines += f'>{label}\n'
            lines += f'{seq}\n'
        os.makedirs(output_dir, exist_ok=True)
        temp_fasta_file = os.path.join(output_dir, 'temp.fasta')
        with open(temp_fasta_file, 'w') as outfile:
            outfile.writelines(lines)
        return temp_fasta_file

    def run(
        self,
        fasta_file,
        output_dir,
    ):
        dataset = SequenceDataset.from_file(fasta_file)
        batches = dataset.get_batch_indices(self.toks_per_batch, extra_toks_per_seq=1)
        data_loader = torch.utils.data.DataLoader(
            dataset, collate_fn=self.alphabet.get_batch_converter(), batch_sampler=batches
        )
        logging.info("Loaded all sequences")
        repr_layers = [33]

        with torch.no_grad():
            for batch_idx, (labels, strs, toks) in enumerate(data_loader):
                logging.info(f"Processing {batch_idx + 1} of {len(batches)} batches ({toks.size(0)} sequences)")
                if torch.cuda.is_available() and not self.nogpu:
                    toks = toks.to(device="cuda", non_blocking=True)

                if self.truncate:
                    toks = toks[:1022]

                out = self.model(toks, repr_layers=repr_layers, return_contacts=False)

                representations = {
                    33: out["representations"][33].to(device="cpu")
                }

                for i, label in enumerate(labels):
                    os.makedirs(os.path.join(output_dir, label), exist_ok=True)
                    result = {"label": label}
                    result["representations"] = {
                        33: representations[33][i, 1: len(strs[i]) + 1].clone()
                    }
                    torch.save(
                        result,
                        os.path.join(output_dir, label, label + ".pt")
                    )


def main(args):
    logging.info("Loading the model...")
    embedding_generator = EmbeddingGenerator(
        args.toks_per_batch,
        args.truncate,
        args.use_local_esm,
        args.nogpu)
    logging.info("Loading the sequences and running the inference...")
    temp_fasta_file = embedding_generator.parse_sequences(
        args.fasta_dir,
        args.output_dir
    )
    embedding_generator.run(
        temp_fasta_file,
        args.output_dir
    )
    os.remove(temp_fasta_file)
    logging.info("Completed.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "fasta_dir", type=str,
        help="""Path to directory containing FASTA files."""
    )
    parser.add_argument(
        "output_dir", type=str,
        help="Directory in which to output embeddings"
    )
    parser.add_argument(
        "--toks_per_batch", type=int, default=4096,
        help="maximum tokens in a batch"
    )
    parser.add_argument(
        "--truncate", action="store_true", default=True,
        help="Truncate sequences longer than 1022 (ESM restriction). Default: True"
    )
    parser.add_argument(
        "--use_local_esm", type=str, default=None,
        help="Use a local ESM repository instead of cloning from Github"
    )
    parser.add_argument(
        "--nogpu", action="store_true",
        help="Do not use GPU"
    )

    args = parser.parse_args()

    main(args)
```
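Each `<label>.pt` file the script writes is a small dict (see `run()` above). A hedged inspection snippet, where `seq1` is an illustrative label rather than a file shipped with the repo:

```python
import torch

result = torch.load("embeddings_output_dir/seq1/seq1.pt")
print(result["label"])                # "seq1"
emb = result["representations"][33]   # per-residue ESM-1b features
print(emb.shape)                      # (sequence_length, 1280)
```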