Source code for stcrpy.tcr_formats.tcr_formats

from Bio.SeqUtils import seq1
import json
import os



[docs]
def to_AF3_json(
    tcr: "TCR",
    tcr_only: bool = True,
    save: bool = True,
    save_dir: str = "",
    name: str = None,
    V_domain_only: bool = False,
) -> dict:
    """Convert a TCR object to a dict in AlphaFold 3 JSON input format (amino acid sequences).

    Example output:
    {
        "name": Job name,
        "modelSeeds": [],
        "sequences": [
            {"proteinChain": {"sequence": AAAAAAAAAAAAAA, "count": 1}},
            {"proteinChain": {"sequence": AAAAAAAAAAAAAA, "count": 1}},
            {"proteinChain": {"sequence": AAAAAAAAAAAAAA, "count": 1}},
        ],
    }

    Sequences are collected per chain ID with ``get_sequences``. When
    ``tcr_only`` is False, the chains of the first MHC and of the first antigen
    returned by ``tcr.get_MHC()`` / ``tcr.get_antigen()`` are merged in; a chain
    ID that is already present is overwritten by the later entry.

    Args:
        tcr (TCR): TCR structure object.
        tcr_only (bool, optional): Whether to include the TCR sequences only, excluding antigen and MHC. Defaults to True.
        save (bool, optional): Whether to also write the dict to ``<save_dir>/<name>.json``. Defaults to True.
        save_dir (str, optional): Directory to save the JSON file to; created if it does not exist. Defaults to "" (current working directory).
        name (str, optional): Name of the AF3 job, also used as the file stem. Defaults to None, in which case ``f"{tcr.parent.parent.id}_{tcr.id}"`` is used.
        V_domain_only (bool, optional): If True, keep only TCR residues numbered 1-128 (IMGT variable domain, inclusive); otherwise keep the full TCR sequence. Only applies to the TCR chains, not to MHC or antigen. Defaults to False.

    Returns:
        dict: Nested dictionary of AF3 sequence inputs.
    """
    # IMGT variable-domain positions are 1-based and inclusive, so the upper
    # bound of the range must be 129 to keep position 128.
    residue_nrs = list(range(1, 129)) if V_domain_only else None
    tcr_sequences = get_sequences(tcr, residues_to_include=residue_nrs)

    if not tcr_only:
        mhcs = tcr.get_MHC()
        if len(mhcs) > 0:
            tcr_sequences.update(get_sequences(mhcs[0]))

        antigens = tcr.get_antigen()
        if len(antigens) > 0:
            tcr_sequences.update(get_sequences(antigens[0]))

    if name is None:
        name = f"{tcr.parent.parent.id}_{tcr.id}"

    tcr_json = {
        "name": name,
        "modelSeeds": [],
        "sequences": [
            {"proteinChain": {"sequence": seq, "count": 1}}
            for seq in tcr_sequences.values()
        ],
    }

    if save:
        # An empty save_dir means the current working directory, which
        # os.makedirs cannot be asked to create.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        path = os.path.join(save_dir, f"{name}.json")
        with open(path, "w", encoding="utf-8") as f:
            json.dump(tcr_json, f)
    return tcr_json




[docs]
def get_sequences(
    entity: "Bio.PDB.Entity",
    amino_acids_only: bool = True,
    residues_to_include: list = None,
) -> dict:
    """Extract sequences from structure objects as a dictionary.

    The entity is asked for its chains via ``entity.get_chains()``. If it has no
    such method and its ``level`` is ``"C"``, it is treated as a single chain.
    Residue names are translated to one-letter codes with ``Bio.SeqUtils.seq1``;
    names it does not recognise become ``"X"``.

    Args:
        entity (Bio.PDB.Entity): Structure object.
        amino_acids_only (bool, optional): Whether to remove non-amino acid 'X' from sequences. Defaults to True.
        residues_to_include (list, optional): Residue numbers (compared against ``residue.id[1]``) to include in the sequence. Defaults to None, which includes every residue.

    Raises:
        AttributeError: If entity has no ``get_chains()`` and is not chain level.

    Returns:
        dict: Dictionary of amino acid sequences, keyed by chain ID in the structure entity.
    """
    # Build the lookup once; set membership avoids scanning a list per residue.
    wanted = None if residues_to_include is None else set(residues_to_include)

    def three_letter_code(resname):
        # seq1 slices its input into fixed 3-character chunks, so a residue
        # name of any other length (e.g. ions "NA"/"ZN", nucleotides "DA")
        # would shift every following residue. Substitute the 3-letter
        # "unknown" code so such residues map to a single "X" instead.
        code = resname.strip()
        return code if len(code) == 3 else "XAA"

    # Keep the try body to the chain lookup only, so AttributeErrors raised
    # while reading residues are not mistaken for a missing get_chains().
    try:
        chains = list(entity.get_chains())
    except AttributeError:
        if entity.level != "C":
            raise
        chains = [entity]

    sequences = {
        chain.id: seq1(
            "".join(
                three_letter_code(residue.resname)
                for residue in chain
                if wanted is None or residue.id[1] in wanted
            )
        )
        for chain in chains
    }

    if amino_acids_only:
        sequences = {k: seq.replace("X", "") for k, seq in sequences.items()}
    return sequences