# openfold/scripts/prep_proteinnet_msas.py
#
# 67 lines
# 1.9 KiB
# Python

import argparse
import logging
import os
import shutil
def main(args):
    """Sort ProteinNet MSAs into per-chain directories under ``args.out_dir``.

    Each file in ``args.msa_dir`` is considered in sorted filename order. The
    filename stem is upper-cased and split on ``_``; files whose stem does not
    yield exactly three fields are skipped. The first field is used as the PDB
    ID and the third as the chain ID (the middle field is ignored). A file is
    only processed if ``args.mmcif_dir`` contains an entry whose upper-cased
    stem equals the PDB ID; it is then copied or moved to
    ``<out_dir>/<PDB_ID>_<CHAIN_ID>/<original filename>``.

    Args:
        args: Namespace with the attributes ``msa_dir``, ``mmcif_dir``,
            ``out_dir`` (directory paths), ``copy`` (truthy to copy, falsy to
            move) and ``max_count`` (stop after this many MSAs have been
            copied/moved; ``None`` means no limit).
    """
    # -1 can never equal a non-negative counter, so it acts as "no limit".
    max_count = args.max_count if args.max_count is not None else -1

    # A set lookup replaces the original merge-join over two sorted listings.
    # That join indexed past the end of the mmCIF list (IndexError) whenever an
    # MSA's PDB ID sorted after the last mmCIF or the directory was empty, and
    # it could miss matches because raw filename order does not always agree
    # with the order of the upper-cased PDB IDs being compared.
    mmcif_ids = {
        os.path.splitext(entry)[0].upper()
        for entry in os.listdir(args.mmcif_dir)
    }

    count = 0
    for f in sorted(os.listdir(args.msa_dir)):
        if count == max_count:
            break

        fields = os.path.splitext(f)[0].upper().split('_')
        if len(fields) != 3:
            continue
        pdb_id, _, chain_id = fields

        # Only consider files with matching mmCIF files
        if pdb_id not in mmcif_ids:
            continue

        path = os.path.join(args.msa_dir, f)
        dirname = os.path.join(args.out_dir, '_'.join([pdb_id, chain_id]))
        os.makedirs(dirname, exist_ok=True)
        dest = os.path.join(dirname, f)
        if args.copy:
            shutil.copyfile(path, dest)
        else:
            # shutil.move falls back to copy+delete when out_dir is on a
            # different filesystem, where a bare os.rename would fail.
            shutil.move(path, dest)

        count += 1
def str_to_bool(value):
    """Convert a command-line string such as "True" or "false" into a bool.

    Used as the argparse ``type`` for ``--copy``. Plain ``type=bool`` does not
    work there because ``bool("False")`` is ``True``: any non-empty string
    would enable the option.

    Args:
        value: The raw argument string (a ``bool`` is passed through as-is).

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If ``value`` is not a recognized spelling.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string is kept as a false value because it was the only input
    # that disabled copying under the previous ``type=bool`` behavior.
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got {!r}".format(value)
    )


if __name__ == "__main__":
    # Command-line entry point: parse the arguments and hand them to main().
    parser = argparse.ArgumentParser(description=
        "Converts raw ProteinNet MSAs into a format recognized by the parser"
    )
    parser.add_argument(
        "msa_dir", type=str, help="Directory containing ProteinNet MSAs"
    )
    parser.add_argument(
        "mmcif_dir", type=str, help="Directory containing PDB mmCIFs"
    )
    parser.add_argument(
        "out_dir", type=str,
        help="Directory to which output should be saved"
    )
    parser.add_argument(
        "--copy", type=str_to_bool, default=True,
        help="Whether to copy the MSAs to out_dir rather than moving them"
    )
    parser.add_argument(
        "--max_count", type=int, default=None,
        help="A bound on the number of MSAs to process"
    )

    args = parser.parse_args()

    main(args)