67 lines
1.9 KiB
Python
67 lines
1.9 KiB
Python
import argparse
|
|
import logging
|
|
import os
|
|
import shutil
|
|
|
|
|
|
def main(args):
    """Place ProteinNet MSA files into per-chain directories under ``args.out_dir``.

    Files in ``args.msa_dir`` are visited in sorted filename order. A file is
    considered only if its stem (filename without the last extension),
    upper-cased and split on ``_``, has exactly three fields; the first field
    is used as the PDB ID and the third as the chain ID (the middle field is
    ignored). The file is then copied or moved to
    ``<out_dir>/<PDB_ID>_<CHAIN_ID>/<original filename>``, but only when
    ``args.mmcif_dir`` contains a file whose upper-cased stem equals the PDB ID.

    Args:
        args: Namespace-like object providing:
            msa_dir: directory that is scanned for MSA files.
            mmcif_dir: directory whose file stems define the accepted PDB IDs.
            out_dir: root directory for the per-chain output directories
                (created on demand).
            copy: if truthy, files are copied with ``shutil.copyfile``;
                otherwise they are moved with ``os.rename``.
            max_count: stop once this many files have been placed; ``None``
                means no limit.
    """
    # -1 can never equal a non-negative counter, so it acts as "unbounded".
    max_count = args.max_count if args.max_count is not None else -1

    # Membership in a set replaces the former sorted-merge pointer walk. That
    # walk indexed past the end of the mmCIF listing (IndexError) as soon as
    # an MSA's PDB ID sorted after the last mmCIF file, or when the mmCIF
    # directory was empty, and it relied on both listings sharing one order.
    known_pdb_ids = {
        os.path.splitext(entry)[0].upper()
        for entry in os.listdir(args.mmcif_dir)
    }

    count = 0
    for f in sorted(os.listdir(args.msa_dir)):
        if count == max_count:
            break

        fields = os.path.splitext(f)[0].upper().split('_')
        if len(fields) != 3:
            # Not of the form <pdb>_<x>_<chain>; leave the file untouched.
            continue

        pdb_id, _, chain_id = fields

        # Only consider files with matching mmCIF files
        if pdb_id not in known_pdb_ids:
            continue

        src = os.path.join(args.msa_dir, f)
        dirname = os.path.join(args.out_dir, '_'.join([pdb_id, chain_id]))
        os.makedirs(dirname, exist_ok=True)
        dest = os.path.join(dirname, f)
        if args.copy:
            shutil.copyfile(src, dest)
        else:
            os.rename(src, dest)

        # Only files that were actually placed count toward max_count.
        count += 1
|
|
|
|
|
|
def _parse_bool(value):
    """Convert a command-line string to a bool.

    ``type=bool`` is unsuitable for argparse because ``bool("False")`` is
    ``True`` (any non-empty string is truthy), which made it impossible to
    switch ``--copy`` off from the command line.

    Args:
        value: The raw option string (a bool is passed through unchanged).

    Returns:
        ``True`` for true/t/yes/y/1 and ``False`` for false/f/no/n/0,
        compared case-insensitively.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean value (true/false), got {value!r}"
    )


def _build_parser():
    """Build the command-line parser for this script."""
    parser = argparse.ArgumentParser(
        description=(
            "Converts raw ProteinNet MSAs into a format recognized by the parser"
        )
    )
    parser.add_argument(
        "msa_dir", type=str, help="Directory containing ProteinNet MSAs"
    )
    parser.add_argument(
        "mmcif_dir", type=str, help="Directory containing PDB mmCIFs"
    )
    parser.add_argument(
        "out_dir", type=str,
        help="Directory to which output should be saved"
    )
    parser.add_argument(
        # _parse_bool instead of bool so that "--copy False" really is False.
        "--copy", type=_parse_bool, default=True,
        help="Whether to copy the MSAs to out_dir rather than moving them"
    )
    parser.add_argument(
        "--max_count", type=int, default=None,
        help="A bound on the number of MSAs to process"
    )
    return parser


if __name__ == "__main__":
    args = _build_parser().parse_args()

    main(args)
|