Source code for stcrpy.tcr_processing.TCRParser

"""
Created on 3 April 2024
@author: Nele Quast, based on leem

TCRParser object which is based on ABDB's AntibodyParser and BioPython's PDB parser.
"""

from itertools import combinations, product
import sys
import os
from collections import defaultdict
import warnings

from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB import NeighborSearch

# TCRDB
from .annotate import annotate, extract_sequence, align_numbering

from ..utils.error_stream import ErrorStream

from .TCRStructure import TCRStructure
from .Model import Model
from .TCR import TCR, abTCR, gdTCR
from .MHC import MHC, MH1, MH2, CD1, MR1, scMH1, scCD1, scMH2
from .Holder import Holder
from .TCRchain import TCRchain
from .MHCchain import MHCchain
from .AGchain import AGchain
from Bio.PDB.Residue import Residue
from .Fragment import Fragment
from .Chemical_components import is_aa, is_common_buffer, get_res_type, is_carbohydrate

MHC_CUTOFF = {
    "MH1:B2M": [32, 37, 32, 32],
    "CD1:B2M": [32, 37, 32, 32],
    "GA1L:B2M": [33, 37, 33, 32],           # added for GA1L case 2po6
    "MR1:B2M": [32, 37, 32, 32],
    "GA1:B2M": [32, 37, 32, 32],
    "GB:GA": [22, 32, 35, 29],
}


[docs] class TCRParser(PDBParser, MMCIFParser): def __init__(self, PERMISSIVE=True, get_header=True, QUIET=False): """ Initialise the PDB parser. This is currently set to using IMGT's numbering scheme and uses the IMGT-defined CDRs. """ self.pdb_parser = PDBParser(PERMISSIVE, get_header, None, QUIET) self.mmcif_parser = MMCIFParser(None, QUIET) self.QUIET = QUIET # Structures are numbered using anarci. self.numbering_method = "anarci" # Choose the numbering scheme and CDR definition, though by default we'll use the IMGT schemes. Have these down for reference use. self.numbering_scheme = "imgt" self.definition = "imgt" def _create_chain(self, chain, new_chain_id, numbering, chain_type): """ Create a new TCR or MHC chain. Residues before the numbered region are now ignored. """ if chain_type in ["D", "A", "B", "G"]: newchain = TCRchain(new_chain_id) elif chain_type in [ "MH1", "CD1", "B2M", "GA", "GB", "GA1L", "GA2L", "GA1", "GA2", "MR1", ]: newchain = MHCchain(new_chain_id) newchain.numbering = numbering unnumbered_list = [] added = False for residue in chain.get_list(): # check whether to add the residue to the new chain (have we got numbering for it) add = False if residue.id in numbering: if numbering[residue.id]: add = True res_id = ( residue.id[0], numbering[residue.id][0], numbering[residue.id][1], ) # field and reannotated things. # if we should add it, add it if add: added = True newresidue = Residue(res_id, residue.resname, residue.segid) for atom in residue.get_list(): newresidue.add(atom.copy()) newresidue.imgt_numbered = True newchain.add(newresidue) # else add it to the unnumbered list - this will include the HETATOMs - analyse them to find haptens. elif added: unnumbered_list.append(residue) # add the unnumbered residues into the chain - renumbering so that they follow on from the numbered regions. ended = sorted([i for i in numbering.values() if i != ""])[-1][0] for residue in unnumbered_list: ended += 1 res_id = (residue.id[0], ended, " ") newresidue = Residue(res_id, residue.resname, residue.segid) for atom in residue.get_list(): newresidue.add(atom.copy()) newchain.add(newresidue) newchain.add_unnumbered(newresidue) newchain.analyse(chain_type) return newchain def _create_scTCR_chains( self, chain, new_chain_id, numbering_1, numbering_2, chain_type1, chain_type2 ): """ Create two TCR chains to be paired up as a TCR. This is effectively a TCR chain with a modified unnumbered list generation Residues before/after the numbered region are now ignored. """ newchain1 = TCRchain(new_chain_id.lower()) newchain2 = TCRchain(new_chain_id.upper()) if chain_type2 in ["G", "A"]: # Just a trick to return self._create_scTCR_chains( chain, new_chain_id, numbering_2, numbering_1, chain_type2, chain_type1 ) newchain1.numbering = numbering_1 newchain2.numbering = numbering_2 newchains = [newchain1, newchain2] unnumbered_set = set() added = False numbered_pos = set(numbering_1.keys()) | set(numbering_2.keys()) for i, numbering in enumerate([numbering_1, numbering_2]): # Get the ith newchain newchain = newchains[i] for residue in chain.get_list(): # check whether to add the residue to the new chain (have we got numbering for it) add = False if residue.id in numbering: if numbering[residue.id]: add = True res_id = ( residue.id[0], numbering[residue.id][0], numbering[residue.id][1], ) # field and reannotated things. # if we should add it, add it if add: added = True newresidue = Residue(res_id, residue.resname, residue.segid) for atom in residue.get_list(): newresidue.add(atom.copy()) newresidue.imgt_numbered = True newchain.add(newresidue) # else add it to the unnumbered list - this will include the HETATOMs - analyse them to find haptens. elif added and residue.id not in numbered_pos: unnumbered_set.add(residue) # add the unnumbered residues into the chain - renumbering so that they follow on from the numbered regions. ended = sorted(numbering_1.values())[-1][0] # get the last numbered value. for residue in sorted(unnumbered_set, key=lambda z: z.id[1]): ended += 1 res_id = (residue.id[0], ended, " ") newresidue = Residue(res_id, residue.resname, residue.segid) for atom in residue.get_list(): newresidue.add(atom.copy()) newchain1.add(newresidue) newchain1.add_unnumbered(newresidue) newchain1.analyse(chain_type1) newchain2.analyse(chain_type2) return newchain1, newchain2
[docs] def get_tcr_structure( self, id, file, prenumbering=None, ali_dict={}, crystal_contacts=[] ): """ Post processing of the TCRPDB.Bio.PDB structure object into a TCR context. id: a string to identify the structure file: the path to the .pdb file optional: prenumbering: prenumbering for the chains in the structure. """ self.warnings = ErrorStream() # get a structure object from biopython. _, ext = os.path.splitext(file) if ext.lower() == ".pdb": structure = self.pdb_parser.get_structure(id, file) self.current_parser = self.pdb_parser elif ext.lower() in [".cif", ".mmcif"]: structure = self.mmcif_parser.get_structure(id, file) self.current_parser = self.mmcif_parser else: self.warnings.write(f"Unrecognised structure file format: {file}") raise ValueError # Create a new TCRStructure object tcrstructure = TCRStructure(structure.id) # Set and analyse header information tcrstructure.set_header(structure.header) self._analyse_header(tcrstructure) # iterate over the models in the structure # iterate backwards through the model list - delete old structure as we go # e.g. NMR structures will be extremely memory expensive (72 models!) for mid in range(len(structure.child_list) - 1, -1, -1): # add a model to the TCR structure model = structure.child_list[mid] newmodel = Model(model.id) tcrstructure.add(newmodel) # initialise holder objects for holding TCR, MHC and non-TCR/non-MHC (antigen) chains. agchains = Holder("Antigen") trchains = Holder("TCRchain") mhchains = Holder("MHCchain") newmodel.add(agchains) newmodel.add(trchains) newmodel.add(mhchains) # iterate over the chains in the model for chain in model.get_list(): # try to number the sequence found in the structure if prenumbering and chain.id in prenumbering: if len(prenumbering[chain.id]) == 2: numbering = [{}, {}] region_types = ["", ""] numbering[0], region_types[0] = self._prenumbered( chain, prenumbering, ali_dict, n=0 ) numbering[1], region_types[1] = self._prenumbered( chain, prenumbering, ali_dict, n=1 ) rtypes = sorted(region_types) # Check that we have a beta/alpha domain or gamma/delta domain if rtypes == ["A", "B"] or rtypes == ["D", "G"]: chain_type = "".join(region_types) scTCR = True # if not, just take the first region and warn the user else: chain_type = region_types[0] numbering = numbering[0] scTCR = False print( "Warning multiple variable regions of the same type (%s) found on chain %s.\nTaking the first variable region only." % (chain_type, chain.id), file=self.warnings, ) elif prenumbering[chain.id][0][-1] not in ["B", "A", "D", "G"]: numbering, chain_type, scTCR = annotate(chain) else: numbering, chain_type = self._prenumbered( chain, prenumbering, ali_dict, n=0 ) scTCR = False else: numbering, chain_type, germline_info, scTCR = annotate(chain) if chain.id in tcrstructure.header["chain_details"]: # clean this up!!! engineered = tcrstructure.header["chain_details"][chain.id][ "engineered" ] details = tcrstructure.header["chain_details"][chain.id] else: engineered = False details = {"molecule": "unknown", "engineered": False} details["genetic_origin"] = germline_info if numbering and chain_type in ["G", "D", "B", "A"]: # create a new TCR chain newchain = self._create_chain( chain, chain.id, numbering, chain_type ) newchain.set_engineered(engineered) newchain.xtra.update(details) trchains.add(newchain) elif numbering and chain_type in [ "MH1", "CD1", "GA", "GB", "B2M", "GA1L", "GA2L", "GA1", "GA2", "MR1", ]: newchain = self._create_chain( chain, chain.id, numbering, chain_type ) newchain.set_engineered(engineered) newchain.xtra.update(details) mhchains.add(newchain) elif numbering and scTCR: # Separate numbering into two domains types = list(chain_type) domain1, domain2 = numbering chain1, chain2 = self._create_scTCR_chains( chain, chain.id, domain1, domain2, types[0], types[1] ) chain1.set_engineered(engineered) chain1.xtra.update(details) chain2.set_engineered(engineered) chain2.xtra.update(details) # We know this is a TCR -- except for 2p1y. if ( chain1.child_dict[(" ", 104, " ")]["CA"] - chain2.child_dict[(" ", 104, " ")]["CA"] ) <= 22: obs_chaintypes = set([chain1.chain_type, chain2.chain_type]) if not obs_chaintypes - set(["A", "B"]): tcr = abTCR(chain1, chain2) elif not obs_chaintypes - set(["G", "D"]): tcr = gdTCR(chain1, chain2) elif not obs_chaintypes - set(["B", "D"]): tcr = abTCR(chain1, chain2) # initial way to deal with narci missclassification of alpha chains as delta chains # tcr = dbTCR(chain1, chain2) tcr.scTCR = True # newmodel.add(tcr) if chain1.id in trchains: trchains.detach_child(chain1.id) if chain2.id in trchains: trchains.detach_child(chain2.id) else: trchains.add(chain1) trchains.add(chain2) # add chain to "antigen" chains else: newchain = self._create_ag_chain(chain) newchain.set_engineered(engineered) newchain.xtra.update(details) agchains.add(newchain) # try to pair the TCR chains to form TCRs. Use a heuristic for now. if not scTCR: pairings = self._pair_chains(trchains) for pair in pairings: trchains.detach_child(pair[0].id) trchains.detach_child(pair[1].id) obs_chaintypes = set([pair[0].chain_type, pair[1].chain_type]) if not obs_chaintypes - set(["A", "B"]): tcr = abTCR(pair[0], pair[1]) elif not obs_chaintypes - set(["G", "D"]): tcr = gdTCR(pair[0], pair[1]) elif not obs_chaintypes - set(["B", "D"]): # tcr = dbTCR(pair[0], pair[1]) tcr = abTCR(pair[0], pair[1]) else: self.warnings.write( "Unusual pairing between %s (V%s) and %s (V%s) has been detected. Treating as separate TCR chains.\n" % ( pair[0].id, pair[0].chain_type, pair[1].id, pair[1].chain_type, ) ) trchains.add(pair[0]) trchains.add(pair[1]) continue newmodel.add(tcr) elif scTCR and trchains: pairings = self._pair_chains(trchains) for pair in pairings: trchains.detach_child(pair[0].id) trchains.detach_child(pair[1].id) obs_chaintypes = set([pair[0].chain_type, pair[1].chain_type]) if not obs_chaintypes - set(["A", "B"]): tcr = abTCR(pair[0], pair[1]) elif not obs_chaintypes - set(["G", "D"]): tcr = gdTCR(pair[0], pair[1]) elif not obs_chaintypes - set(["B", "D"]): tcr = abTCR(pair[0], pair[1]) # tcr = dbTCR(pair[0], pair[1]) else: self.warnings.write( "Unusual pairing between %s (V%s) and %s (V%s) has been detected. Treating as separate TCR chains.\n" % ( pair[0].id, pair[0].chain_type, pair[1].id, pair[1].chain_type, ) ) trchains.add(pair[0]) trchains.add(pair[1]) continue newmodel.add(tcr) # Pair up the MHC chains -- whether that's GA and GB or MH1 with B2M pairings = self._pair_mhc(mhchains) for pair in pairings: mhchains.detach_child(pair[0].id) mhchains.detach_child(pair[1].id) obs_chaintypes = set([pair[0].chain_type, pair[1].chain_type]) if ( not (obs_chaintypes - set(["MH1", "B2M"])) or not (obs_chaintypes - set(["GA1", "GA2"])) or not (obs_chaintypes - set(["GA1", "B2M"])) ): mhc = MH1(pair[0], pair[1]) elif not (obs_chaintypes - set(["GA", "GB"])): mhc = MH2(pair[0], pair[1]) elif not (obs_chaintypes - set(["CD1", "B2M"])) or not ( obs_chaintypes - set(["GA1L", "GA2L"])) or not ( obs_chaintypes - set(["GA1L", "B2M"]) ): mhc = CD1(pair[0], pair[1]) elif not (obs_chaintypes - set(["MR1", "B2M"])): mhc = MR1(pair[0], pair[1]) else: raise ValueError(f'MHC pairing {pair} could not be assigned.') newmodel.add(mhc) # allow instantiation of single chain MH1 type MH class if the alpha helices forming chain has been observed # allow instantiation of single chain MH2 type MH class if one of the GA or GB chain has been observed ids_to_detach = [] for mhc_chain in mhchains: if mhc_chain.chain_type in ["MH1", "GA1", "GA2"]: ids_to_detach.append(mhc_chain.id) sc_mhc = scMH1(mhc_chain) newmodel.add(sc_mhc) elif mhc_chain.chain_type in ["CD1", "GA1L"]: ids_to_detach.append(mhc_chain.id) sc_mhc = scCD1(mhc_chain) newmodel.add(sc_mhc) elif mhc_chain.chain_type in ["GA", "GB"]: ids_to_detach.append(mhc_chain.id) sc_mhc = scMH2(mhc_chain) newmodel.add(sc_mhc) warnings.warn( f"Single chain MH class II instantiated with chain type {mhc_chain.chain_type}. It is possible the other MHC class II chain has not been identified." ) for mhc_chain_id in ids_to_detach: mhchains.detach_child(mhc_chain_id) # Match MHC+antigen complex with a TCR self._match_units(newmodel, trchains, mhchains, agchains, crystal_contacts) del structure.child_list[ mid ] # delete the structure model list (goes backwards so indexing is not affected) # Delete empty holders empty_holders = [ holder.id for holder in newmodel.child_list if not holder.child_list ] for holder_id in empty_holders: newmodel.detach_child(holder_id) del structure if not self.QUIET and self.warnings.log: sys.stderr.write("\n".join(self.warnings.log)) sys.stderr.write("\n") tcrstructure.warnings = self.warnings return tcrstructure
def _analyse_header(self, header): """ Analysis of the header that has been parsed by Biopython We add information for the various chains and have a look for engineered and hapten flags. Add more information to this parser. """ if isinstance(header, TCRStructure): header = header.get_header() elif not header: header = {} header["chain_details"] = {} if "compound" in header: for compound in header["compound"]: # iteration over details. if "chain" in header["compound"][compound]: # get the chains that the compound is refering to. chains = [ c.strip().upper() for c in header["compound"][compound]["chain"].split(",") if len(c.strip()) == 1 ] for chain in chains: if chain not in header["chain_details"]: header["chain_details"][chain] = {} if "molecule" in header["compound"][compound]: # add molecule annotation to each chain for chain in chains: header["chain_details"][chain]["molecule"] = header[ "compound" ][compound]["molecule"] else: for chain in chains: header["chain_details"][chain]["molecule"] = "unknown" if "engineered" in header["compound"][compound]: if ( "no" in header["compound"][compound]["engineered"] or "false" in header["compound"][compound]["engineered"] or not header["compound"][compound]["engineered"] ): header["compound"][compound]["engineered"] = False else: header["compound"][compound]["engineered"] = True for chain in chains: header["chain_details"][chain]["engineered"] = header[ "compound" ][compound]["engineered"] else: for chain in chains: header["chain_details"][chain]["engineered"] = False else: continue # analyse the journal reference and the title for references to hapten or scfv # compile title-like text title = ( header["journal_reference"].lower() + " ".join(header["structure_reference"]).lower() ) if "journal" in header: title += header["journal"].lower() else: sys.stderr.write("Header could not be parsed") def _create_ag_chain(self, chain): """ Create a new 'antigen' chain - this just means it is not a TCR chain. """ newchain = AGchain(chain.id) for residue in chain.get_list(): newresidue = Residue(residue.id, residue.resname, residue.segid) newchain.add(newresidue) for atom in residue.get_list(): newresidue.add(atom.copy()) newchain.set_type() return newchain def _pair_chains(self, chains): """ Method to pair beta/alpha and gamma/delta chains to form TCRs. Currently this is based off of ABDB.AbPDB's chain pairing method where the distance between positions 104 are calculated using the same 22A cutoff. This is a simple heuristic for now. """ pairings = [] # We use a known distance between conserved cysteine residues at the interface points = { "B": (" ", 104, " "), "A": (" ", 104, " "), "D": (" ", 104, " "), "G": (" ", 104, " "), } for pair in combinations(chains, 2): if pair[0].chain_type != pair[1].chain_type: try: a1 = pair[0].child_dict[points[pair[0].chain_type]].child_dict["CA"] a2 = pair[1].child_dict[points[pair[1].chain_type]].child_dict["CA"] except KeyError: continue if a1 - a2 < 22: pairings.append(pair) return pairings def _pair_mhc(self, chains): """ This is a heuristic that pairs MHC chains together. In theory, we should have a GA-GB chain (MHC2) or a GA1/GA2-C (MHC1). Use an arbitrary cutoff of 45A for pairing the MHC for now. Where possible, use the conserved cysteine in the GA2/GB domains (resi 1074 or 11) and pair up with another conserved point (Cys 104 in B2M, N86 in GA) Impose an angle cutoff so that the cysteine of the B2M points in the correct orientation """ pairings = [] points = { "MH1": [(" ", 15, " "), (" ", 51, " ")], "GA1": [(" ", 15, " "), (" ", 51, " ")], "GA2": [(" ", 15, " "), (" ", 51, " ")], "CD1": [(" ", 15, " "), (" ", 51, " ")], # pretty similar to MH1 "GA1L": [(" ", 15, " "), (" ", 51, " ")], # pretty similar to MH1 "MR1": [(" ", 15, " "), (" ", 51, " ")], # pretty similar to MH1 "B2M": [(" ", 23, " "), (" ", 104, " ")], "GA": [(" ", 29, " "), (" ", 37, " ")], "GB": [(" ", 39, " "), (" ", 64, " ")], } acceptable_types = [ "MH1:B2M", "GB:GA", "CD1:B2M", "MR1:B2M", "GA1:B2M", "GA1L:B2M", ] for pair in combinations(chains, 2): # Get the chain objects c1, c2 = pair # What type of MHC are we pairing? the_type = ":".join(sorted([c1.chain_type, c2.chain_type], reverse=True)) if the_type in acceptable_types: # Sort by chain type; p1 = pair[0] if c1.chain_type > c2.chain_type else pair[1] p2 = pair[1] if c2.chain_type < c1.chain_type else pair[0] try: a1, a2 = ( p1[points[p1.chain_type][0]]["CA"], p1[points[p1.chain_type][1]]["CA"], ) a3, a4 = ( p2[points[p2.chain_type][0]]["CA"], p2[points[p2.chain_type][1]]["CA"], ) dist_array = [a3 - a1, a4 - a2, a4 - a1, a3 - a2] constants = all( [ dist_array[i] <= MHC_CUTOFF[the_type][i] for i in range(len(dist_array)) ] ) if constants: pairings.append(pair) except KeyError: continue return pairings def _get_sugar_fragments(self, sugar): """ Get connected hetatoms to form sugar molecules. """ # Make a sugar dictionary sugar = dict(list(zip([s.id for s in sugar], sugar))) # Get the connect records for the bonded atoms # 1 - 6 Record name "CONECT" # 7 - 11 Integer serial Atom serial number # 12 - 16 Integer serial Serial number of bonded atom # 17 - 21 Integer serial Serial number of bonded atom # 22 - 26 Integer serial Serial number of bonded atom # 27 - 31 Integer serial Serial number of bonded atom connect_records = {} for c in [ line.strip() for line in self.current_parser.trailer if "CONECT" in line ]: try: connect_records[int(c[6:11])] = [] except IndexError: continue for b, e in [(11, 16), (16, 21), (21, 26), (26, 31)]: try: if c[b:e].strip(): connect_records[int(c[6:11])].append(int(c[b:e])) else: break except IndexError: break except ValueError: self.warnings.write( "Warning: unexpected CONECT record format %s" % c.strip() ) monomer_atoms = [] polymers = [] if connect_records: # Get the serial_numbers to residue id. atomid_to_resid = {} for r in sugar: for atom in sugar[r]: atomid_to_resid[atom.serial_number] = sugar[r].id # Get the residue connections r_connections = {} for a in connect_records: if a in atomid_to_resid: try: r_connections[atomid_to_resid[a]].update( [ atomid_to_resid[ai] for ai in connect_records[a] if ai in atomid_to_resid ] ) except KeyError: r_connections[atomid_to_resid[a]] = set( [ atomid_to_resid[ai] for ai in connect_records[a] if ai in atomid_to_resid ] ) connected_sets = [] for r in sorted(r_connections, key=lambda x: x[1]): added = 0 for i in range(len(connected_sets)): if connected_sets[i] & r_connections[r]: connected_sets[i].update(r_connections[r]) added = 1 break if not added: connected_sets.append(r_connections[r]) n = 0 for mol in connected_sets: if len(mol) > 1: polymers.append(Fragment("sugar%d" % n)) for r in sorted(mol, key=lambda x: x[1]): polymers[n].add(sugar[r]) n += 1 else: monomer_atoms += [atom for atom in sugar[list(mol)[0]]] else: for s in sugar: monomer_atoms += [atom for atom in sugar[s]] return polymers, monomer_atoms def _find_chain_hetatoms(self, chain): """ Function for TCR and MHC chains to filter out hetatom records; this is to clean up the _match_units code below """ hetatoms, sugars = [], [] for residue in chain.get_unnumbered(): # Ignore waters and non-standard amino acids if residue.id[0] == "W" or is_aa(residue, standard=False): continue if is_carbohydrate(residue): sugars.append(residue) else: hetatoms.extend(list(residue.get_atoms())) return hetatoms, sugars def _match_units(self, model, trchains, mhchains, agchains, crystal_contacts=[]): """ Match MHC+Peptide chains to TCR chains. model is the current model - extract the TCRs from it (paired chains have been removed) trchains contains those TCR chains that have been unable to be paired to form TCRs agchains contains non-TCR chains that are potential antigens. Goal: Match TCR <-> MHC + peptide antigen. """ # Get all T-cell receptor-like objects (TCR, TCRchain), and MHC-like objects. tcell_receptors = [h for h in model if isinstance(h, TCR)] + trchains.child_list mhc_complexes = [h for h in model if isinstance(h, MHC)] + mhchains.child_list # Initialise 5 dictionaries which carries a list of atoms per chain ID. antigen_atoms, cdr_atoms, mh_atoms, antigen_hetatoms, antigen_sugars = ( defaultdict(list), defaultdict(list), defaultdict(list), defaultdict(list), defaultdict(list), ) # Look through TCR and MHC and see if there are any weird hetatoms and sugars in the structure. for tr in tcell_receptors: for cdr in tr.get_CDRs(): # Only get CDR3? if "3" not in cdr.id: continue # only look at CA or CB atoms of the CDR; this is used later. cdr_atoms[tr.id] += [ atom for atom in cdr.get_atoms() if atom.id == "CB" or atom.id == "CA" ] # get TCR type and get chain's hetatoms accordingly if isinstance(tr, TCR) and tr.get_TCR_type() == "abTCR": beta_chain = tr.get_VB() alpha_chain = tr.get_VA() antigen_hetatoms[tr.VB], antigen_sugars[tr.VB] = ( self._find_chain_hetatoms(beta_chain) ) antigen_hetatoms[tr.VA], antigen_sugars[tr.VA] = ( self._find_chain_hetatoms(alpha_chain) ) elif isinstance(tr, TCR) and tr.get_TCR_type() == "gdTCR": delta_chain = tr.get_VD() gamma_chain = tr.get_VG() antigen_hetatoms[tr.VD], antigen_sugars[tr.VD] = ( self._find_chain_hetatoms(delta_chain) ) antigen_hetatoms[tr.VG], antigen_sugars[tr.VG] = ( self._find_chain_hetatoms(gamma_chain) ) elif isinstance(tr, TCR) and tr.get_TCR_type() == "dbTCR": beta_chain = tr.get_VB() delta_chain = tr.get_VD() antigen_hetatoms[tr.VB], antigen_sugars[tr.VB] = ( self._find_chain_hetatoms(beta_chain) ) antigen_hetatoms[tr.VD], antigen_sugars[tr.VD] = ( self._find_chain_hetatoms(delta_chain) ) # Unpaired TCR chain elif isinstance(tr, TCRchain): antigen_hetatoms[tr.id], antigen_sugars[tr.id] = ( self._find_chain_hetatoms(tr) ) # Do the same for MHC. for mh in mhc_complexes: # Keep G domain atoms; Get the Helix region of MHC mh_atoms[mh.id] = [ atom for atom in mh.get_atoms() if (atom.id == "CB" or atom.id == "CA") and atom.region == "Helix" ] if isinstance(mh, MHC) and mh.MHC_type == "MH1": MH1, B2M = mh.get_MH1(), mh.get_B2M() if MH1 is not None: antigen_hetatoms[mh.MH1], antigen_sugars[mh.MH1] = ( self._find_chain_hetatoms(MH1) ) else: GA1 = mh.get_GA1() antigen_hetatoms[mh.GA1], antigen_sugars[mh.GA1] = ( self._find_chain_hetatoms(GA1) ) if B2M is not None: # handle single chain MH1 case antigen_hetatoms[mh.B2M], antigen_sugars[mh.B2M] = ( self._find_chain_hetatoms(B2M) ) elif isinstance(mh, MHC) and mh.MHC_type == "CD1": CD1, B2M = mh.get_CD1(), mh.get_B2M() if CD1 is not None: antigen_hetatoms[mh.CD1], antigen_sugars[mh.CD1] = ( self._find_chain_hetatoms(CD1) ) if B2M is not None: antigen_hetatoms[mh.B2M], antigen_sugars[mh.B2M] = ( self._find_chain_hetatoms(B2M) ) elif isinstance(mh, MHC) and mh.MHC_type == "MR1": MR1, B2M = mh.get_MR1(), mh.get_B2M() antigen_hetatoms[mh.MR1], antigen_sugars[mh.MR1] = ( self._find_chain_hetatoms(MR1) ) antigen_hetatoms[mh.B2M], antigen_sugars[mh.B2M] = ( self._find_chain_hetatoms(B2M) ) elif isinstance(mh, MHC) and mh.MHC_type == "MH2": GA, GB = mh.get_GA(), mh.get_GB() antigen_hetatoms[mh.GA], antigen_sugars[mh.GA] = ( self._find_chain_hetatoms(GA) ) antigen_hetatoms[mh.GB], antigen_sugars[mh.GB] = ( self._find_chain_hetatoms(GB) ) # Unpaired MHC chains -- if any, go here. elif isinstance(mh, MHCchain): antigen_hetatoms[mh.id], antigen_sugars[mh.id] = ( self._find_chain_hetatoms(mh) ) for antigen in agchains: antigen_atoms[antigen.id] = [ a for a in antigen.get_atoms() if a.parent.id[0] == " " or is_aa(a.parent) ] # test ATOM records or amino acid HETATM records antigen_hetatoms[antigen.id] = [ a for a in antigen.get_atoms() if a.parent.id[0].startswith("H") and not is_aa(a.parent) ] # hetatm and not an amino acid # Problem here with carbohydrate molecules as units not recognised as polymers. # Have to use connect records to join them # Then consider them in the same way. sugars = [] for chain_id in antigen_sugars: if antigen_sugars[chain_id]: polymers, monomer_atoms = self._get_sugar_fragments( antigen_sugars[chain_id] ) sugars += polymers antigen_hetatoms[chain_id] += monomer_atoms # We look through hetatms -- sometimes, hetatms can be associated to TR or MH chains, so we separate and whisk these out. # Protein/peptide entities override small molecules that are more likely to be buffer or cofactor molecules. # Get non-empty antigen hetatoms first non_empty_ag = [k for k in antigen_hetatoms if antigen_hetatoms[k]] # Pair proteins/peptides with TCR then MHC self._protein_peptide_pass( model, tcell_receptors, cdr_atoms, antigen_atoms, crystal_contacts ) self._het_sugar_pass( tcell_receptors, cdr_atoms, non_empty_ag, antigen_hetatoms, sugars, distance=8.0, ) # If a TCR does not have a detected MHC chain, then skip the remaining MHC-specific parsing bits. if not mhc_complexes: return # Have a very tight cutoff for MHCs that present het atoms (e.g. CD1 types) self._het_sugar_pass( mhc_complexes, mh_atoms, non_empty_ag, antigen_hetatoms, sugars, distance=3.5, ) if antigen_atoms: self._protein_peptide_pass( model, mhc_complexes, mh_atoms, antigen_atoms, crystal_contacts ) # Pair a TCR with an MHC and vice-versa; go through all possible combinations of TCR/MHC # We see if a CB/CA atom of the helix region of an MHC is within 8A of a TCR CDR loop's CB/CA atoms. # This is similar to the _protein_peptide_pass algorithm; we find the number of contacts between MHC and TCR, # and use the MHC with highest no. of contacts contact_freq = defaultdict(int) tr_mh_pairs = list(product(tcell_receptors, mhc_complexes)) for tr, mh in tr_mh_pairs: ns = NeighborSearch(cdr_atoms[tr.id]) for atom in mh_atoms[mh.id]: # This is a generous cutoff to be used for now. contacts = ns.search(atom.get_coord(), 8.0, level="R") for c in contacts: contact_freq[(tr.id, mh.id)] += 1 # Sort TR-MH pairs by number of contacts and then get the highest-frequency pairs sorted_contacts = sorted( list(contact_freq.items()), key=lambda z: z[1], reverse=True ) paired_tr_mh = set() for pair, contacts in sorted_contacts: tr, mh = pair # If the TCR has already been paired, or if we know that the TCR and MHC are forming crystal contacts, move on. if tr in paired_tr_mh or (tr, mh) in crystal_contacts: continue model[tr]._add_mhc(model[mh]) model[mh]._add_tcr(model[tr]) paired_tr_mh.add(tr) def _protein_peptide_pass( self, model, complexes, receptor_atoms, antigen_atoms, crystal_contacts=[] ): """ This is a generic method to process which proteins/peptides belong to a TCR or MHC. Needs testing. Args: complexes: list of TCR/TCRchain objects or MHC/MHCchain objects receptor_atoms: list of atom subset that will likely contact the antigen (e.g. cdr_atoms) antigen_atoms: list of atoms in the antigen. """ ns = NeighborSearch( [atom for chain in receptor_atoms for atom in receptor_atoms[chain]] + [atom for chain in antigen_atoms for atom in antigen_atoms[chain]] ) contacts = [con for con in ns.search_all(8.0, "R")] contact_freq = defaultdict(lambda: defaultdict(int)) # all_cpx_chains is a dictionary that has a TCR/MHC chain as a key and the ID of the TCR/MHC as value all_cpx_chains = dict() for cpx in complexes: cpx_ch = list(cpx.id) for c in cpx_ch: all_cpx_chains[c] = cpx.id # trids stores all paired/unpaired TR chains cpxids = set(all_cpx_chains.values()) ags = set() for c in contacts: p1 = str(c[0].parent.id) # get the chain id p2 = str(c[1].parent.id) # Reject cases where contacts are from the same chain, or the combination of chains is a TCR potential_contact = p1 + p2 potential_contact2 = p2 + p1 if ( p1 == p2 or potential_contact in contact_freq or potential_contact2 in contact_freq ): continue # If the potential contacting set of chains (p1+p2) is not a TR and p1 is a TR chain but p2 is NOT a TR chain, then p2 is an AG if ( (potential_contact not in cpxids) and (p1 in all_cpx_chains) and (p2 not in all_cpx_chains) ): T = all_cpx_chains[p1] ag = p2 # If the second set of potential contacting set of chains (p2+p1) is not a TR and p2 is a TR chain but p1 is NOT a TR chain, # then p1 is an AG elif ( (potential_contact2 not in cpxids) and (p2 in all_cpx_chains) and (p1 not in all_cpx_chains) ): T = all_cpx_chains[p2] ag = p1 else: continue # T is either the paired TCR id or an id of a single TCR chain contact_freq[T][ag] += 1 ags.add(ag) # Iterate over the TR identifiers for cpx_id in cpxids: # If there are detected antigen contacts if contact_freq[cpx_id]: # Get the antigen ag = max(contact_freq[cpx_id], key=lambda x: contact_freq[cpx_id][x]) if (cpx_id, ag) not in crystal_contacts: model[cpx_id].antigen = ( [] ) # disregard smaller antigens if peptide or protein present. model[cpx_id]._add_antigen( model[ag] ) # pair up an antigen to the TCR. # Remove the antigen now, as it is paired up with a TR. if ag in ags: ags.remove(ag) # iterate over the remaining antigens to see if they are also bound. for ag in ags: cmax = 0 for C in contact_freq: if ag in contact_freq[C] and (C, ag) not in crystal_contacts: if contact_freq[C][ag] > cmax: paired_cpx = C cmax = contact_freq[C][ag] if cmax: if len(contact_freq) > 1: self.warnings.write( "Crystal Contact Warning: antigen %s has been paired with TCR %s" % (str(ag), str(paired_cpx)) ) model[paired_cpx]._add_antigen(model[ag]) else: model[paired_cpx]._add_antigen(model[ag]) def _het_sugar_pass( self, receptors, receptor_atoms, non_empty_ag, antigen_hetatoms, sugars, distance=8.0, ): """ """ # Iterate through every possible pair of TR and hetatom chain for rec, antigen_het in product(receptors, non_empty_ag): # Initialise a NeighborSearch based on the atoms for a particualr chain of hetatoms ns = NeighborSearch(antigen_hetatoms[antigen_het]) # Look through CDR atoms (CA/CB) for atom in receptor_atoms[rec.id]: # use 8.0A from the CDR CA/CB to the antigen. Using level = "R" returns Residue objects. contacts = ns.search(atom.get_coord(), distance, level="R") if contacts: for contact in contacts: # we assume that each contact residue is a single molecule (need to test its not just a residue) if self._check_het_antigen(contact): residue_type = get_res_type(contact) if residue_type == "Hapten": self.warnings.write( """Warning: Multiple hapten-antigen like molecules found in binding site - this needs attention as could be solvent/cofactor.""" ) if residue_type == "non-polymer": contact.type = "Hapten" # add a antigen type attribute to the residue contact.get_type = ( lambda: "Hapten" ) # add a get antigen type method to the residue elif residue_type == "nucleic-acid": contact.type = "nucleic-acid" # add a antigen type attribute to the residue contact.get_type = ( lambda: "nucleic-acid" ) # add a get antigen type method to the residue elif residue_type == "saccharide": contact.type = "carbohydrate" # add a antigen type attribute to the residue contact.get_type = ( lambda: "carbohydrate" ) # add a get antigen type method to the residue rec._add_antigen(contact) # Iterate through sugar fragments # for rec, sugar_fragment in product( receptors, sugars ): # ns = NeighborSearch([atom for atom in sugar_fragment.get_atoms()]) # for atom in receptor_atoms[rec.id]: # contacts = ns.search(atom.get_coord(), distance, level="R") # if contacts: # sugar_fragment.type = "carbohydrate" # add a antigen type attribute to the fragment # sugar_fragment.get_type = lambda: "carbohydrate" # add a get antigen type method to the fragment # rec._add_antigen(sugar_fragment) def _check_het_antigen(self, residue): """ Method to perform checks on a potential hetatm residue. 1. Check that it is not an amino acid - we don't want a modified residue to be found as a hapten. 2. Check that the residue name is not a common buffer using high frequency residue codes. If we throw it out due to check 3 it will be reported to user. """ # check 1 # no amino acids if is_aa(residue, standard=False): return False # check 2 # check for common buffers/unlikely haptens if is_common_buffer(residue): if not self.QUIET: self.warnings.write( "Common molecule %s found in the binding site - not considered an antigen" % residue.get_resname() ) return False # add more checks as problems arise return True def _prenumbered(self, chain, prenumbering, ali_dict={}, n=0): """ Method to deal with numbering supplied by the user. (or from the database) """ if ali_dict: ali_dict = ali_dict[chain.id][n] annotation, chain_type = prenumbering[chain.id][n] try: sequence_list, sequence_str, warnings = extract_sequence( chain, return_warnings=True ) numbering = align_numbering(annotation, sequence_list, ali_dict) except ( AssertionError ): # If the user has an alignment file generated before hetatoms included sequence_list, sequence_str, warnings = extract_sequence( chain, return_warnings=True, ignore_hets=True ) numbering = align_numbering(annotation, sequence_list, ali_dict) self.warnings.log += warnings return numbering, chain_type
# class error_stream: # def __init__(self): # self.log = [] # def __str__(self): # return "\n".join(self.log) # def __repr__(self): # return self.__str__() # def write(self, s): # self.log.append(str(s).strip("\n"))