# openfold/scripts/expand_alignment_duplicates.py

"""
The OpenProteinSet alignment database is non-redundant, meaning that it only
stores one explicit representative alignment directory for all PDB chains in a
100% sequence identity cluster. In order to add explicit alignments for all PDB
chains, this script will add the missing chain directories and symlink them to
their representative alignment directories. This is required in order to train
OpenFold on the full PDB, not just one representative chain per cluster.
"""

from argparse import ArgumentParser
from pathlib import Path

from tqdm import tqdm


def create_duplicate_dirs(duplicate_chains: list[list[str]], alignment_dir: Path):
    """
    Create duplicate directory symlinks for all chains in the given duplicate lists.

    For each group, the first chain that already has an entry in
    ``alignment_dir`` is used as the representative. Every other chain in the
    group that has no entry yet is created as a symlink pointing at the
    representative. Groups in which no chain has an entry are reported and
    skipped. Progress and a final count are printed to stdout.

    Args:
        duplicate_chains (list[list[str]]): A list of lists, where each inner list
            contains chains that are 100% sequence identical.
        alignment_dir (Path): Path to flattened alignment directory, with one
            subdirectory per chain. Should be an absolute path: the symlink
            targets are built from it, and a relative symlink target is
            interpreted relative to the directory containing the link.
    """
    print("Creating duplicate directory symlinks...")
    dirs_created = 0
    for chains in tqdm(duplicate_chains):
        # find the first chain in the group that already has an alignment
        representative_chain = next(
            (chain for chain in chains if (alignment_dir / chain).exists()),
            None,
        )
        if representative_chain is None:
            print(f"No representative chain found for {chains}, skipping...")
            continue

        representative_path = alignment_dir / representative_chain

        # create symlinks for all other chains
        for chain in chains:
            if chain == representative_chain:
                continue

            target_path = alignment_dir / chain
            # exists() follows symlinks, so it is False for a dangling link
            # even though the name is taken; is_symlink() catches that case so
            # symlink_to() is never called on an occupied path.
            if target_path.exists() or target_path.is_symlink():
                print(f"Chain {chain} already exists, skipping...")
                continue

            target_path.symlink_to(representative_path)
            dirs_created += 1

    print(f"Created directories for {dirs_created} duplicate chains.")


def main(alignment_dir: Path, duplicate_chains_file: Path):
    """
    Read the duplicate chains file and create the missing chain symlinks.

    Args:
        alignment_dir (Path): Path to flattened alignment directory, with one
            subdirectory per chain.
        duplicate_chains_file (Path): Path to a text file where each line is a
            whitespace-separated list of chains that are 100% sequence
            identical. Blank lines are ignored.
    """
    # read duplicate chains file; split() with no separator already discards
    # surrounding whitespace, and blank lines (which split to an empty list)
    # are dropped so they are not reported as groups without a representative
    with open(duplicate_chains_file, "r") as fp:
        groups = (line.split() for line in fp)
        duplicate_chains = [chains for chains in groups if chains]

    # convert to absolute path for symlink creation
    alignment_dir = alignment_dir.resolve()

    create_duplicate_dirs(duplicate_chains, alignment_dir)


if __name__ == "__main__":
    # Command-line entry point. Both arguments are positional paths, so they
    # are declared from one (name, help) table and handed to main() unchanged.
    # The doubled percent sign in the help text is argparse's escape for a
    # literal "%".
    positional_args = (
        (
            "alignment_dir",
            """Path to flattened alignment directory, with one subdirectory
                per chain.""",
        ),
        (
            "duplicate_chains_file",
            """Path to file containing duplicate chains, where each line
                contains a space-separated list of chains that are 100%%
                sequence identical.
                """,
        ),
    )

    cli = ArgumentParser(description=__doc__)
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, type=Path, help=arg_help)

    cli_args = cli.parse_args()
    main(cli_args.alignment_dir, cli_args.duplicate_chains_file)