Source code for stcrpy.tcr_processing.annotate

"""
Created on 10 May 2017
@author: leem

Implementation to call anarci (built-in to STrDab) to annotate structures.
"""
import sys
import warnings

from Bio.PDB.Polypeptide import aa1, aa3  # to allow me to return "X" if not found.

to_one_letter_code = dict(list(zip(aa3, aa1)))

# Import TCRDB's constants and common functions.
from .utils.constants import TCR_CHAINS



[docs]
def call_anarci(
    seq,
    allow=set(
        [
            "B",
            "A",
            "D",
            "G",
            "GA1",
            "GA2",
            "GA1L",
            "GA2L",
            "GA",
            "GB",
            "B2M",
            "MH1",
            "MR1",
            "MR2",
        ]
    ),
):
    """
    Use the ANARCI program to number the sequence.

    Args:
        seq: An amino acid sequence that you wish to number.

    Returns:
        numbering, chain type, germline information
    """
    from anarci import number as anarci_number

    numbering, chain_type, germline_info = anarci_number(
        seq, allow=allow, assign_germline=True
    )

    if numbering and chain_type in allow:
        return [(_, aa) for _, aa in numbering if aa != "-"], chain_type, germline_info
    elif numbering and chain_type in ["BA", "GD", "AB", "DG"]:
        return (
            [[(_, aa) for _, aa in n if aa != "-"] for n in numbering],
            chain_type,
            germline_info,
        )
    else:
        return False, False, False




[docs]
def annotate(chain):
    """
    Annotate the sequence of a chain object from TCRDB.TcrPDB
    # e.g. if you have chains B, A and X, you want to force the annotator to return the annotation
    # for B and A but not for X (the antigen)

    returns a dictionary which has the residue ids as key and the annotation as value or is False,
    and chain type which is B/A/G/D/MH1/GA/GB/B2M or False.
    """
    sequence_list, sequence_str = extract_sequence(chain)
    numbering, chain_type, germline_info = call_anarci(sequence_str)

    # Use
    if chain_type:
        chtype = "".join(sorted(chain_type, reverse=True))
    else:
        chtype = False

    if chtype in ("BA", "GD"):
        aligned_numbering = align_scTCR_numbering(
            numbering, sequence_list, sequence_str
        )
        # aligned_numbering = cleanup_scTCR_numbering(aligned_numbering, sequence_list)
        scTCR = True
    # elif chtype == "DC1" or chtype == "RM1":
    #     # Use the scTCR numbering trick; since CD1/MR1 numbering only spans up to residue ~87 and
    #     aligned_numbering = align_scTCR_numbering(
    #         numbering, sequence_list, sequence_str
    #     )
    #     aligned_numbering[0].update(aligned_numbering[1])
    #     aligned_numbering = aligned_numbering[0]  # combine the numbering
    #     aligned_numbering = cleanup_scTCR_numbering(aligned_numbering, sequence_list)
    #     scTCR = False
    else:
        # align the original residue id's to the numbering
        aligned_numbering = align_numbering(numbering, sequence_list)
        scTCR = False

    # aligned numbering is a dictionary of the original residue ids and the new numbering
    return aligned_numbering, chain_type, germline_info, scTCR




[docs]
def extract_sequence(
    chain, selection=False, return_warnings=False, ignore_hets=False, backbone=False
):
    """
    Get the amino acid sequence of the chain.
    Residues containing HETATOMs are skipped -->  Residues containing HETATOMs are checked as an amino acid.

    Residues containing HETATOMs are checked  to be amino acids and the single letter returned.

    This works provided the residues in the chain are in the correct order.

    Args:
        selection: a selection object to select certain residues
        return_warnings: Flag to return a list of warnings or not
        backbone: Flag whether to only show residues with a complete backbone (in the structure) or not.
    Returns:
        The sequence in a resid:aa tuple list and the sequence as a string.

    """
    sequence_list = []
    warnings = []
    for residue in chain.get_list():
        if (
            residue.id[0] != " "
        ):  # skip HETATOMs - this is not necesserily a good idea, flag to the user that is has been done.
            #            if residue.get_resname() not in to_one_letter_code: # Check that the residue can be converted into a single letter.
            #                continue
            #            if residue.get_resname() in to_one_letter_code: # Check that the residue can be converted into a single letter.
            #                pass
            if residue.get_resname() in to_one_letter_code:
                if ignore_hets:
                    if return_warnings:
                        warnings.append(
                            """Warning: HETATM residue %s at position %s (PDB numbering) found in chain %s.
                            Not including it in structure's sequence."""
                            % (
                                residue.get_resname(),
                                str(residue.id[1]) + residue.id[2].strip(),
                                residue.parent.id,
                            )
                        )
                    else:
                        sys.stderr.write(
                            """Warning: HETATM residue %s position %s (PDB numbering) found in chain %s.
                            Not including it in structure's sequence.\n"""
                            % (
                                residue.get_resname(),
                                str(residue.id[1]) + residue.id[2].strip(),
                                residue.parent.id,
                            )
                        )
                    continue
            else:
                continue

        if selection:
            if not selection.accept(residue):
                continue

        atoms_of_residue = list(residue.child_dict.keys())
        backboneCondition = (
            "N" in atoms_of_residue
            and "C" in atoms_of_residue
            and "CA" in atoms_of_residue
            and "O" in atoms_of_residue
        )  # Boolean to hold if residue has a full backbone

        # CASE 1: backbone = True, and residue has a full backbone; convert a.a into single letter
        if backbone and backboneCondition:
            sequence_list.append(
                (residue.id, to_one_letter_code.get(residue.get_resname(), "X"))
            )
        # CASE 2: backbone = True, but residue does not have a full backbone; use a gap in sequence annotation
        elif backbone and not backboneCondition:
            sequence_list.append((residue.id, "-"))
        # CASE 0 (default): don't care about backbone, just write it to sequence if it's found in structure.
        elif not backbone:
            sequence_list.append(
                (residue.id, to_one_letter_code.get(residue.get_resname(), "X"))
            )  # i am

    sequence_str = "".join([r[1] for r in sequence_list])
    if not return_warnings:
        return sequence_list, sequence_str
    else:
        return sequence_list, sequence_str, warnings




[docs]
def interpret(x):
    """
    Function to interpret an annotation in the form H100A into the form ( 100, 'A' )
    """
    assert x[0] in TCR_CHAINS, x
    try:
        return (int(x[1:]), " ")
    except ValueError:
        return (int(x[1:-1]), x[-1])




[docs]
def align_numbering(numbering, sequence_list, alignment_dict={}):
    """
    Align the sequence that has been numbered to the sequence you input.
    The numbered sequence should be "in" the input sequence.
    If not, supply an alignment dictionary.(align sequences and use get_alignment_dict(ali1,ali2))
    """
    if numbering:
        numbered_sequence = "".join([r[1] for r in numbering])
        input_sequence = "".join([r[1] for r in sequence_list])
        if not alignment_dict:
            try:
                numbered_sequence_ali, input_sequence_ali = pairwise_alignment(
                    numbered_sequence, input_sequence
                )
                alignment_dict = get_alignment_dict(
                    input_sequence_ali, numbered_sequence_ali
                )
            except Exception:
                raise Exception(
                    "Could not align numbered sequence to aligned sequence:"
                    + " "
                    + str(numbered_sequence)
                    + " "
                    + str(input_sequence)
                )

        aligned_numbering = {}
        n = -1
        after_flag = False
        for i in range(len(input_sequence)):
            if i in alignment_dict:
                # during
                assert (
                    after_flag is False
                ), "Extra residue in structure than expected from provided sequence"
                assert (
                    input_sequence[i] == numbered_sequence[alignment_dict[i]]
                ), "alignment dictionary failed"
                aligned_numbering[sequence_list[i][0]] = numbering[alignment_dict[i]][0]
                n = numbering[-1][0][0] + 1
            elif n > -1:
                # after
                after_flag = True
                aligned_numbering[sequence_list[i][0]] = (n, " ")
                n += 1
            else:
                # before numbering
                aligned_numbering[sequence_list[i][0]] = ""

        return aligned_numbering
    else:
        return False




[docs]
def align_scTCR_numbering(numbering, sequence_list, sequence_str):
    """
    Align the sequence that has been numbered to a scTCR structure.
    Args:
        numbering:     numbered list of residues; this is usually a two-element list/tuple from TCRDB.anarci.number
        sequence_list: list of residues (e.g. from a structure) in its original numbering
        sequence_str:  string form of sequence_list
    """
    if numbering:
        numbered_sequence = ["".join([r[1] for r in n]) for n in numbering]
        input_sequence = sequence_str

        aligned_numbering = [{}, {}]

        for ii, a_sequence in enumerate(numbered_sequence):

            # Align each of the joined sequences from the numbering into the target structure sequence in "sequence_str"
            try:
                a_sequence_ali, input_sequence_ali = pairwise_alignment(
                    a_sequence, input_sequence
                )
                alignment_dict = get_alignment_dict(input_sequence_ali, a_sequence_ali)
            except Exception:
                raise Exception(
                    "Could not align numbered sequence to aligned sequence"
                    + "\n"
                    + str(numbered_sequence)
                    + "\n"
                    + str(input_sequence)
                )

            n = -1
            after_flag = False
            # for i in xrange(len(input_sequence)):
            for i in alignment_dict:
                if i in alignment_dict:
                    # during
                    assert (
                        after_flag is False
                    ), "Extra residue in structure than expected from provided sequence"
                    assert (
                        input_sequence[i] == numbered_sequence[ii][alignment_dict[i]]
                    ), "alignment dictionary failed"
                    aligned_numbering[ii][sequence_list[i][0]] = numbering[ii][
                        alignment_dict[i]
                    ][0]
                    n = numbering[ii][-1][0][0] + 1
                elif n > -1:
                    # after
                    after_flag = True
                    aligned_numbering[ii][sequence_list[i][0]] = (n, " ")
                    n += 1
                else:
                    # before numbering
                    aligned_numbering[ii][sequence_list[i][0]] = ""

        return aligned_numbering
    else:
        return False




[docs]
def cleanup_scTCR_numbering(numbering_dict, sequence_list):
    """
    The scTCR numbering method, while useful for sequences with two domains,
    can have gaps in between (e.g. CD1 molecule of 4lhu).
    This is to close the gaps in the numbering so that residues that were unnumbered by anarci don't move around
    during structural parsing (when they're probably just connections between domains).

    Args:
        numbering_dict: numbered dictionary from align_scTCR_numbering
        sequence_list : sequence list from the structure for alignment.
    """
    positions = [p[0] for p in sequence_list]

    # This gets the last numbered residue in numbering_dict
    lastkey = max(numbering_dict)
    lastidx = positions.index(lastkey)  # Where is this on sequence_list?

    for index in range(1, len(positions)):

        # If we got to the last key, don't bother.
        if index > lastidx:
            break

        key = positions[index]

        # If a target key is not in the numbering dict, see where it fits, then fit a number in it.
        if key not in numbering_dict:

            # Get the left and right bounds of the gap
            left, right = False, False
            lidx, ridx = 0, 0
            lval = (0, " ")
            j = 0

            # Continue iterating left from the missing key until we find one that exists
            while not left:
                key_left = positions[index - j]
                if key_left in numbering_dict:
                    left = True
                    lidx = (
                        index - j
                    )  # Last known index of sequence_list where we know a key exists
                    lval = numbering_dict[key_left]
                else:
                    j += 1

            j = 0
            while not right:
                key_right = positions[index + j]
                if key_right in numbering_dict:
                    right = True
                    ridx = (
                        index + j
                    )  # Last known index of sequence_list on the right where we know a key exists
                else:
                    j += 1

            # For every key between the left and right, fill in
            for k, missing_key in enumerate(positions[lidx + 1 : ridx]):
                numbering_dict[missing_key] = (lval[0] + k + 1, " ")

    return numbering_dict




[docs]
def get_alignment_dict(ali1, ali2):
    """
    Get a dictionary which tells you the index in sequence 2 that should align with the index in sequence 1 (key)

    ali1:  ----bcde-f---        seq1: bcdef
    ali2:  ---abcd--f---        seq2: abcdf

    alignment_dict={
        0:1,
        1:2,
        2:3,
        4:4
        }

    If the index is aligned with a gap do not include in the dictionary.
    e.g  1 in alignment_dict  --> True
    e.g  3 in alignment_dict  --> False
    """
    assert len(ali1) == len(
        ali2
    ), "aligned sequences must be same lengths (including gaps)"
    alignment_dict = {}
    p1 = -1
    p2 = -1
    for ap in range(len(ali1)):
        if ali1[ap] != "-" and ali2[ap] != "-":
            p1 += 1
            p2 += 1
            alignment_dict[p1] = p2
        elif ali1[ap] != "-":
            p1 += 1
        elif ali2[ap] != "-":
            p2 += 1
    return alignment_dict




[docs]
def pairwise_alignment(seq1, seq2, exact=False):
    """
    Function to do alignment of sequences between sequences using biopython.
    """
    with warnings.catch_warnings():  # prevents pairwise2 deprecation warning from being raised
        warnings.simplefilter("ignore")
        from Bio.pairwise2 import align

    alignment = None
    s1_aln, s2_aln = easy_alignment(seq1, seq2)
    if s1_aln:
        return s1_aln, s2_aln

    if exact:
        # Align with a match score of 1, mismatch of 0, gap opening of -1.001, and gap extension of -1
        alignment = align.globalms(seq1, seq2, 1, 0, -1, -1.001)
    else:
        alignment = align.globalxx(seq1, seq2)

    if alignment:
        aligned_seqs = alignment[0]
        return aligned_seqs[0], aligned_seqs[1]
    else:
        return False, False




[docs]
def easy_alignment(seq1, seq2):
    """
    Function to align two sequences by checking if one is in the other.
    This function will conserve gaps.
    """
    assert (
        type(seq1) is str and type(seq2) is str
    ), "Sequences must be strings for easy_alignment"
    if seq1 in seq2:
        start = seq2.index(seq1)
        seq1_ali = "-" * start + seq1 + "-" * (len(seq2) - start - len(seq1))
        return seq1_ali, seq2

    elif seq2 in seq1:
        start = seq1.index(seq2)
        seq2_ali = "-" * start + seq2 + "-" * (len(seq1) - start - len(seq2))
        return seq1, seq2_ali

    else:
        # Can't align them # I return just one value here.
        return False, False




[docs]
def validate_sequence(seq):
    """
    Check whether a sequence is a protein sequence or if someone has submitted something nasty.
    """
    if len(seq) > 10000:
        raise AssertionError("Sequence too long.")
    if any([1 for s in seq.upper() if s not in aa1]):
        raise AssertionError(
            "Unknown amino acid letter found in sequence: " + seq.upper()
        )
    else:
        return True



if __name__ == "__main__":
    pass