Source code for stcrpy.tcr_formats.tcr_haddock

import os
import re
import warnings
import numpy as np

import Bio
from Bio.PDB.Superimposer import Superimposer

from .. import tcr_processing


[docs] class HADDOCKFormatter: def __init__(self, save_dir: str = None): """Constructor HADDOCK formatting object. Args: save_dir (str, optional): Path to save formatted files to. Defaults to None. """ self.save_dir = save_dir if save_dir is not None else "."
[docs] def tcr_to_haddock(self, tcr: "TCR"): """Bound reformatting of TCR structure object to HADDOCK compatible PDB file. Args: tcr (TCR): TCR structure object """ self.write_TCR_pdb_file(tcr, self.save_dir)
[docs] def pMHC_to_haddock(self, mhc: "MHC", antigen: list["Antigen"]): """Bound reformatting of MHC and antigen structures object to HADDOCK compatible PDB file. Args: mhc (MHC): MHC structure object antigen (Antigen): Antigen structure object """ self.write_antigen_pdb_file(mhc, antigen, self.save_dir)
[docs] def write_TCR_pdb_file(self, tcr: "TCR", save_dir: str): """ Writes TCR structure to a PDB file in a format HADDOCK can deal with. Generates a PDB file, a mapping from the old to the new numbering, and a list of active residues to restrain the HADDOCK simulation. Args: tcr (TCR): The TCR structure. save_dir (str): The directory to save the files (default is current directory). """ tcr_id = f"{tcr.parent.parent.id}_{tcr.id}" new_tcr_structure = Bio.PDB.Model.Model(id=0) residue_conversion = {} for i, chain in enumerate(tcr.get_chains()): residue_conversion[chain.id] = {} new_chain = Bio.PDB.Chain.Chain(id=chain.id) selected_residues = [ res for res in Bio.PDB.Selection.unfold_entities(chain, "R") if res.id[1] in list(range(1, 130)) ] for residue in selected_residues: # handle insertion numbering for HADDOCK new_residue = residue.copy() if new_residue.id[-1] != " ": new_residue.id = ( new_residue.id[0], 10 * new_residue.id[1] + (200 * i) + imgt_insertion_char_to_int(new_residue.id[-1]), " ", ) else: new_residue.id = ( new_residue.id[0], new_residue.id[1] + (200 * i), new_residue.id[-1], ) if new_residue.id != residue.id: residue_conversion[chain.id][residue.id] = new_residue.id new_chain.add(new_residue) new_tcr_structure.add(new_chain) if not os.path.exists(os.path.join(save_dir, tcr_id)): os.mkdir(os.path.join(save_dir, tcr_id)) with open( os.path.join(save_dir, f"{tcr_id}/{tcr_id}_haddock_active_residues.txt"), "w", ) as f: # get cdr numbering active_residues = [] for chain in tcr.get_chains(): cdrs = chain.get_CDRs() res_list = [r.id for cdr in cdrs for r in cdr.get_residues()] for res_key in res_list: # res_key = (" ", *res[0]) if res_key in residue_conversion[chain.id]: active_residues.append(residue_conversion[chain.id][res_key][1]) else: active_residues.append(res_key[1]) f.write("TCR ACTIVE RESIDUES FOR HADDOCK\n") f.write(",".join([str(r) for r in active_residues])) f.write("\n") with open( os.path.join(save_dir, f"{tcr_id}/{tcr_id}_haddock_renumbering.txt"), "w" ) as f: f.write("TCR RESIDUE RENUMBERING FOR HADDOCK\n") for chain_id in residue_conversion: for res, new_res in residue_conversion[chain_id].items(): r_as_str = f"({chain_id},({res[0]},{res[1]},{res[2]}),({new_res[0]},{new_res[1]},{new_res[2]})\n" f.write(r_as_str) pdb_io = Bio.PDB.PDBIO() pdb_io.set_structure(new_tcr_structure) filename = os.path.join(save_dir, f"{tcr_id}/{tcr_id}_tcr_for_docking.pdb") pdb_io.save(filename) return filename
[docs] def write_antigen_pdb_file( self, mhc: "MHC", antigen: list["Antigen"], save_dir: str ): """ Writes the antigen PDB file for docking with HADDOCK. Generates a PDB file, a file containing the renumbering mapping, and a list of active residues to restrict the simulation. Args: mhc (MHC): MHC structure object. antigen (list[Antigen]): List containing antigen chain. Should be length 1. save_dir (str, optional): The directory to save the PDB file. Defaults to ".". Returns: str: The filename of the saved antigen PDB file. """ # structure = p.get_structure() # chains = [i for i in structure.get_antigens() if i.id in antigen_chain_ids] mhc_chains = [c for c in mhc.get_chains() if c.chain_type != "B2M"] antigen_chains = mhc_chains + antigen mhc_id = f"{mhc.parent.parent.id}_MHC_{''.join([c.id for c in antigen_chains])}" new_antigen_structure = Bio.PDB.Model.Model(id=0) residue_conversion = {} for i, chain in enumerate(antigen_chains): residue_conversion[chain.id] = {} new_chain = Bio.PDB.Chain.Chain(id=chain.id) for residue in chain.get_residues(): # handle insertion numbering for HADDOCK new_residue = residue.copy() new_residue.id = ( new_residue.id[0], new_residue.id[1] + (500 * i), new_residue.id[-1], ) if new_residue.id != residue.id: residue_conversion[chain.id][residue.id] = new_residue.id new_chain.add(new_residue) new_antigen_structure.add(new_chain) if not os.path.exists(os.path.join(save_dir, mhc_id)): os.mkdir(os.path.join(save_dir, mhc_id)) with open( os.path.join(save_dir, f"{mhc_id}/{mhc_id}_haddock_active_residues.txt"), "a", ) as f: # get peptide numbering and select as active active_residues = [] for chain in antigen: res_list = list(r.id for r in chain.get_residues()) for res_key in res_list: if res_key in residue_conversion[chain.id]: active_residues.append(residue_conversion[chain.id][res_key][1]) else: active_residues.append(res_key[1]) f.write("ANTIGEN ACTIVE RESIDUES FOR HADDOCK\n") f.write(",".join([str(r) for r in active_residues])) f.write("\n") with open( os.path.join(save_dir, f"{mhc_id}/{mhc_id}_haddock_renumbering.txt"), "a" ) as f: f.write("ANTIGEN RESIDUE RENUMBERING FOR HADDOCK\n") for chain in residue_conversion: for res, new_res in residue_conversion[chain].items(): r_as_str = f"({chain},({res[0]},{res[1]},{res[2]}),({new_res[0]},{new_res[1]},{new_res[2]})\n" f.write(r_as_str) pdb_io = Bio.PDB.PDBIO() pdb_io.set_structure(new_antigen_structure) filename = os.path.join(save_dir, f"{mhc_id}/{mhc_id}_antigen_for_docking.pdb") pdb_io.save(filename) return filename
[docs] class HADDOCKResultsParser: def __init__( self, haddock_results_dir: str, tcr_renumbering_file: str = None, pmhc_renumbering_file: str = None, ): """Parser for results from HADDOCK simulations. Renumbers TCR, MHC and Antigen using renumbering files, and parses result metrics. Args: haddock_results_dir (str): path to HADDOCK simulation results. tcr_renumbering_file (str, optional): path to text file containing TCR renumbering to restore from HADDOCK compatible numbering. Defaults to None. pmhc_renumbering_file (str, optional): path to text file containing MHC and antigen renumbering to restore from HADDOCK compatible numbering. Defaults to None. """ self.haddock_results_dir = haddock_results_dir self.tcr_renumbering_file = tcr_renumbering_file self.pmhc_renumbering_file = pmhc_renumbering_file if self.haddock_results_dir.endswith(".tgz"): warnings.warn( "HADDOCK results are compressed. Decompress results before proceeding." )
[docs] def renumber_all_haddock_predictions(self): """Renumber all haddock predictions contained in results folder. Requires standard HADDOCK output directory format.""" path = os.path.join(self.haddock_results_dir, "structures/it1/") pattern = re.compile(r"complex_.*\.pdb") for filename in os.listdir(path): if pattern.match(filename): file_path = os.path.join(path, filename) self.renumber_haddock_prediction( file_path, self.tcr_renumbering_file, self.pmhc_renumbering_file, )
[docs] def renumber_haddock_prediction( self, docked_prediction_file: str, haddock_renumbering_file: str, antigen_renumbering_file: str = None, ) -> Bio.PDB.Model.Model: """ Renumber the HADDOCK prediction based on the renumbering files. Args: docked_prediction_file (str): Path to the docked prediction file. haddock_renumbering_file (str): Path to the HADDOCK renumbering file. antigen_renumbering_file (str, optional): Path to the antigen renumbering file. Needed for TCR only PDBs with no antigen. Defaults to None. Returns: Bio.PDB.Model.Model: The renumbered HADDOCK prediction. Raises: ValueError: If the renumbering index is not found in the renumbering file. """ # initialise file parsers tcr_parser = tcr_processing.TCRParser.TCRParser() bio_parser = Bio.PDB.PDBParser() # find chain ID of TCR to distinguish TCR from antigen tcr_chain_id = list( tcr_parser.get_tcr_structure("tmp", docked_prediction_file).get_TCRchains() )[0].get_id() docked_prediction = bio_parser.get_structure("docked", docked_prediction_file) # get chains of HADDOCK dock merged_tcr_chain = docked_prediction[0][tcr_chain_id] merged_antigen_chain = [ chain for chain in docked_prediction.get_chains() if chain.id != merged_tcr_chain.id ][0] # get renumbering with open(haddock_renumbering_file, "r") as f: lines = f.readlines() try: antigen_renumbering_index = lines.index( "ANTIGEN RESIDUE RENUMBERING FOR HADDOCK\n" ) antigen_renumber_indices = ( antigen_renumbering_index + 1, -1, ) except ValueError: antigen_renumbering_index = -1 antigen_renumber_indices = ( -1, -1, ) tcr_renumber_indices = (1, antigen_renumbering_index) # if antigen renumbering file is provided, get antigen renumbering from there if antigen_renumbering_file is not None: lines = ( lines[: antigen_renumber_indices[0] - 1] if antigen_renumber_indices[0] != -1 else lines ) tcr_renumber_indices = (1, len(lines) - 1) antigen_renumber_indices = (len(lines) + 1, -1) with open(antigen_renumbering_file, "r") as f: antigen_xtal_lines = f.readlines() antigen_renumbering_index = antigen_xtal_lines.index( "ANTIGEN RESIDUE RENUMBERING FOR HADDOCK\n" ) lines.extend(antigen_xtal_lines[antigen_renumbering_index:]) # renumber TCR by creating new PDB model and populating with residues tcr_parsed_lines = list( map( parse_renumbered_line, lines[tcr_renumber_indices[0] : tcr_renumber_indices[1]], ) ) changed_tcr_chain_ids, _, _ = list(zip(*tcr_parsed_lines)) tcr = Bio.PDB.Model.Model(id=0) if len(set(changed_tcr_chain_ids)) > 1: id_for_conserved_numbered_chain = min( set(changed_tcr_chain_ids), key=changed_tcr_chain_ids.count ) else: id_for_conserved_numbered_chain = merged_tcr_chain.id tcr.add(Bio.PDB.Chain.Chain(id=id_for_conserved_numbered_chain)) try: tcr.add( Bio.PDB.Chain.Chain( id=max(set(changed_tcr_chain_ids), key=changed_tcr_chain_ids.count) ) ) second_tcr_chain_id = None except Bio.PDB.PDBExceptions.PDBConstructionException: for id_to_try in "ABCDEFGH": try: tcr.add(Bio.PDB.Chain.Chain(id=id_to_try)) second_tcr_chain_id = id_to_try break except Bio.PDB.PDBExceptions.PDBConstructionException: continue renumbered_residues = {} for renumbering in tcr_parsed_lines: try: residue = merged_tcr_chain[renumbering[-1]] merged_tcr_chain.detach_child(renumbering[-1]) residue.id = renumbering[1] if second_tcr_chain_id is None: if renumbering[0] not in renumbered_residues: renumbered_residues[renumbering[0]] = [] renumbered_residues[renumbering[0]].append(residue) else: if second_tcr_chain_id not in renumbered_residues: renumbered_residues[second_tcr_chain_id] = [] renumbered_residues[second_tcr_chain_id].append(residue) # tcr[renumbering[0]].add(residue) except KeyError as e: warnings.warn( f"""Renumbering {renumbering} failed with Key Error {e}""" ) for residue in merged_tcr_chain.get_residues(): if id_for_conserved_numbered_chain not in renumbered_residues: renumbered_residues[id_for_conserved_numbered_chain] = [] renumbered_residues[id_for_conserved_numbered_chain].append(residue) # tcr[id_for_conserved_numbered_chain].add(residue) # sort the residues for chain_id in renumbered_residues: sorted_residues = sort_residues_by_imgt_numbering( renumbered_residues[chain_id] ) for res in sorted_residues: tcr[chain_id].add(res) # renumber antigen antigen_parsed_lines = list( map( parse_renumbered_line, lines[antigen_renumber_indices[0] :], ) ) changed_antigen_chain_ids, _, _ = list(zip(*antigen_parsed_lines)) try: tcr.add(Bio.PDB.Chain.Chain(id=merged_antigen_chain.id)) except Bio.PDB.PDBExceptions.PDBConstructionException: for id_to_try in "ABCDEFGH": if id_to_try == set(changed_antigen_chain_ids).pop(): continue try: tcr.add(Bio.PDB.Chain.Chain(id=id_to_try)) merged_antigen_chain.id = id_to_try break except Bio.PDB.PDBExceptions.PDBConstructionException: continue assert ( len(set(changed_antigen_chain_ids)) == 1 ), "More than one chain renumbered in renumbering file" try: tcr.add(Bio.PDB.Chain.Chain(id=set(changed_antigen_chain_ids).pop())) new_antigen_chain_id = None except Bio.PDB.PDBExceptions.PDBConstructionException: for id_to_try in "ABCDEFGH": if id_to_try == set(changed_antigen_chain_ids).pop(): continue try: tcr.add(Bio.PDB.Chain.Chain(id=id_to_try)) new_antigen_chain_id = id_to_try break except Bio.PDB.PDBExceptions.PDBConstructionException: continue for renumbering in antigen_parsed_lines: try: residue = merged_antigen_chain[renumbering[-1]] merged_antigen_chain.detach_child(renumbering[-1]) residue.id = renumbering[1] if new_antigen_chain_id is None: tcr[renumbering[0]].add(residue) else: tcr[new_antigen_chain_id].add(residue) except KeyError as e: warnings.warn( f"""Renumbering {renumbering} failed with Key Error {e}""" ) for residue in merged_antigen_chain.get_residues(): tcr[merged_antigen_chain.id].add(residue) # create structure object and save tcr_struct = Bio.PDB.Structure.Structure(id=0) tcr_struct.add(tcr) pdb_io = Bio.PDB.PDBIO() pdb_io.set_structure(tcr_struct) save_to = "renumbered_" + docked_prediction_file.split("/")[-1] filename = os.path.join(*docked_prediction_file.split("/")[:-1], save_to) pdb_io.save(filename)
[docs] def get_haddock_scores(self) -> "pandas.DataFrame": """Retrieve HADDOCK energy scoes and RMSD evaluations from simulation output: \nColumns: \n "haddock_score", \n "interface_rmsd", \n "ligand_rmsd", \n "frac_common_contacts", \n "E_vdw", \n "E_elec", \n "E_air", \n "E_desolv", \n "ligand_rmsd_2", \n "cluster_id", Raises: FileNotFoundError: HADDOCK file contianing scores not found. Returns: pandas.DataFrame: DataFrame with HADDOCK simulation metrics. """ import pandas as pd import os haddock_columns = [ # 'idx', "haddock_score", "interface_rmsd", "ligand_rmsd", "frac_common_contacts", "E_vdw", "E_elec", "E_air", "E_desolv", "ligand_rmsd_2", "cluster_id", ] haddock_scores_file = "complex_HS_irmsd_lrmsd_fnat.list" try: df = pd.read_csv( os.path.join(self.haddock_results_dir, haddock_scores_file), sep=" ", names=haddock_columns, ) return df except FileNotFoundError: raise FileNotFoundError( f"File: complex_HS_irmsd_lrmsd_fnat.list containing HADDOCK docking metrics not found in {self.haddock_results_dir}" )
[docs] def imgt_insertion_char_to_int(char: str) -> int: """ Converts an IMGT insertion character to an integer. Args: char (str): The IMGT insertion character. Returns: int: The corresponding integer value. """ return ord(char) - ord("A") + 1
[docs] def parse_renumbered_line(line: str) -> tuple: """ Parses a renumbered line from a file and extracts the chain ID, original numbering, and HADDOCK numbering. Args: line (str): The renumbered line to parse. Returns: tuple: A tuple containing the chain ID, original numbering, and HADDOCK numbering. Example: line = "(O,( ,3, ),( ,203, )" result = parse_renumbered_line(line) # Output: (O)', ('', '3', ''), ('', '203', '')) """ chain_id = line[1] content = re.findall(r"\((.*?)\)", line) original_numbering = tuple( int(x.strip()) if x.isdigit() else x.strip() for x in content[0].split("(")[-1].split(",") ) haddock_numbering = tuple( int(x.strip()) if x.isdigit() else x.strip() for x in re.split(r",\s*", content[1]) ) def add_empty_id(numbering): return tuple(x if x != "" else " " for x in numbering) return chain_id, add_empty_id(original_numbering), add_empty_id(haddock_numbering)
[docs] def sort_residues_by_imgt_numbering( residues: "list[Bio.PDB.Residue]", ) -> "list[Bio.PDB.Residue]": """Sort residues in order by IMGT numbering. Args: residues (list[Bio.PDB.Residue]): List of IMGT numbered residues. Returns: list[Bio.PDB.Residue]: Sorted list of IMGT numbered residuess. """ sorted_residues = sorted(residues, key=lambda x: (x.id[1], x.id[2])) imgt_nr_112_subsequence = [ (i, res) for i, res in enumerate(sorted_residues) if res.id[1] == 112 ] if len(imgt_nr_112_subsequence) > 0: indices, imgt_nr_112_subsequence = list(zip(*imgt_nr_112_subsequence)) sorted_imgt_nr_112_subsequence = sorted( imgt_nr_112_subsequence, key=lambda x: x.id[2], reverse=True ) for i, idx in enumerate(indices): sorted_residues[idx] = sorted_imgt_nr_112_subsequence[i] return sorted_residues