Source code for graphein.construct_graphs

"""Class for working with Protein Structure Graphs"""
# %%
# Graphein
# Author: Arian Jamasb <arian@jamasb.io>
# License: MIT
# Project Website: https://github.com/a-r-j/graphein
# Code Repository: https://github.com/a-r-j/graphein
import os
import glob
import re
import pandas as pd
import numpy as np
import dgl
import subprocess
import networkx as nx
import torch as torch
import torch.nn.functional as F
from torch_geometric.data import Data
from biopandas.pdb import PandasPdb
from Bio.PDB import *
from Bio.PDB.DSSP import residue_max_acc, dssp_dict_from_pdb_file
from Bio.PDB.Polypeptide import aa1, one_to_three
from dgllife.utils import mol_to_bigraph, mol_to_complete_graph, mol_to_nearest_neighbor_graph
from dgllife.utils import BaseBondFeaturizer, BaseAtomFeaturizer, CanonicalAtomFeaturizer, CanonicalBondFeaturizer
from rdkit.Chem import MolFromPDBFile
from sklearn.metrics import pairwise_distances
from sklearn import preprocessing
from sklearn.neighbors import kneighbors_graph
from scipy import spatial


# Todo add SS featuriser for Mol Graph?
# Todo atom featuriser
# Todo create SS element-level graph
# Mol graph Nearest Neighbour


[docs]class ProteinGraph(object):
[docs] def __init__(self, granularity, keep_hets, insertions, node_featuriser, get_contacts_path, pdb_dir, contacts_dir, exclude_waters=True, covalent_bonds=True, include_ss=True, include_ligand=False, intramolecular_interactions=None, graph_constructor=None, edge_featuriser=None, edge_distance_cutoff=None, verbose=True, deprotonate=False, remove_string_labels=False, long_interaction_threshold=None): """ Initialise ProteinGraph Generator Class :param granularity: Specifies granularity of the graph construction. {'atom', 'CA', 'CB'}. CA = Alpha Carbon, CB = Beta Carbon :type granularity: str :param keep_hets: Keep heteroatoms present in the PDB file. Typically, these correspond to metal ions or modified residues (e.g. MSE) :type keep_hets: bool :param insertions: Keep atoms/residues with multiple insertion positions. Multiple insertions exist when the electron density is too vague to define a single insertion :type insertions: bool :param node_featuriser: DGL Node featuriser for atom-level graphs. Canonical Featurises recommended. :type node_featuriser: DGL Node Featuriser :param pdb_dir: Directory to PDB files. We will download .PDB files to this folder if you don't have an existing local copy of the requisite structure :type pdb_dir: str :param contacts_dir: Directory to GetContacts files :type contacts_dir: str :param exclude_waters: Specifies inclusion of water molecules. Not yet fully operational. :type exclude_waters: bool :param covalent_bonds: Specifies inclusion of covalent backbone. E.g. joins adjacent residues in the sequence :type covalent_bonds: bool :param include_ss: Specifies inclusion of secondary structure features computed by DSSP. Future warning: this will be changed in a subsequent update for managing feature selection. :type include_ss: bool :param include_ligand: Not yet implemented. Will specify option to include bound ligand(s) in the graph. :type include_ligand: bool :param intramolecular_interactions: List of allowable intramolecular interactions to include from GetContacts. ['sb', 'pc', 'ps', 'ts', 'vdw', 'hb', 'hbb', 'hbsb', 'hbbb', 'hbss', 'wb', 'wb2', 'hblb', 'hbls', 'lwb', 'lwb2', 'hp']. See https://getcontacts.github.io/interactions.html for details. :type intramolecular_interactions: list :param edge_distance_cutoff: Distance in angstroms specifying cutoff distance for constructing an edge when using distance construction :type edge_distance_cutoff: float :param long_interaction_threshold: Specifies minimum distance in sequence for two nodes to be connected :type long_interaction_threshold: int """ self.long_interaction_threshold = long_interaction_threshold self.remove_string_labels = remove_string_labels self.verbose = verbose self.edge_distance_cutoff = edge_distance_cutoff self.include_ligand = include_ligand self.include_ss = include_ss self.granularity = granularity self.keep_hets = keep_hets self.insertions = insertions self.node_featuriser = node_featuriser self.embedding_dict = { 'meiler': { 'ALA': [1.28, 0.05, 1.00, 0.31, 6.11, 0.42, 0.23], 'GLY': [0.00, 0.00, 0.00, 0.00, 6.07, 0.13, 0.15], 'VAL': [3.67, 0.14, 3.00, 1.22, 6.02, 0.27, 0.49], 'LEU': [2.59, 0.19, 4.00, 1.70, 6.04, 0.39, 0.31], 'ILE': [4.19, 0.19, 4.00, 1.80, 6.04, 0.30, 0.45], 'PHE': [2.94, 0.29, 5.89, 1.79, 5.67, 0.30, 0.38], 'TYR': [2.94, 0.30, 6.47, 0.96, 5.66, 0.25, 0.41], 'PTR': [2.94, 0.30, 6.47, 0.96, 5.66, 0.25, 0.41], 'TRP': [3.21, 0.41, 8.08, 2.25, 5.94, 0.32, 0.42], 'THR': [3.03, 0.11, 2.60, 0.26, 5.60, 0.21, 0.36], 'TPO': [3.03, 0.11, 2.60, 0.26, 5.60, 0.21, 0.36], 'SER': [1.31, 0.06, 1.60, -0.04, 5.70, 0.20, 0.28], 'SEP': [1.31, 0.06, 1.60, -0.04, 5.70, 0.20, 0.28], 'ARG': [2.34, 0.29, 6.13, -1.01, 10.74, 0.36, 0.25], 'LYS': [1.89, 0.22, 4.77, -0.99, 9.99, 0.32, 0.27], 'KCX': [1.89, 0.22, 4.77, -0.99, 9.99, 0.32, 0.27], 'LLP': [1.89, 0.22, 4.77, -0.99, 9.99, 0.32, 0.27], 'HIS': [2.99, 0.23, 4.66, 0.13, 7.69, 0.27, 0.30], 'ASP': [1.60, 0.11, 2.78, -0.77, 2.95, 0.25, 0.20], 'GLU': [1.56, 0.15, 3.78, -0.64, 3.09, 0.42, 0.21], 'PCA': [1.56, 0.15, 3.78, -0.64, 3.09, 0.42, 0.21], 'ASN': [1.60, 0.13, 2.95, -0.60, 6.52, 0.21, 0.22], 'GLN': [1.56, 0.18, 3.95, -0.22, 5.65, 0.36, 0.25], 'MET': [2.35, 0.22, 4.43, 1.23, 5.71, 0.38, 0.32], 'MSE': [2.35, 0.22, 4.43, 1.23, 5.71, 0.38, 0.32], 'PRO': [2.67, 0.00, 2.72, 0.72, 6.80, 0.13, 0.34], 'CYS': [1.77, 0.13, 2.43, 1.54, 6.35, 0.17, 0.41], 'CSO': [1.77, 0.13, 2.43, 1.54, 6.35, 0.17, 0.41], 'CAS': [1.77, 0.13, 2.43, 1.54, 6.35, 0.17, 0.41], 'CAF': [1.77, 0.13, 2.43, 1.54, 6.35, 0.17, 0.41], 'CSD': [1.77, 0.13, 2.43, 1.54, 6.35, 0.17, 0.41], 'UNKNOWN': [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00] }, 'kidera': { 'A': [-1.56, -1.67, -0.97, -0.27, -0.93, -0.78, -0.2, -0.08, 0.21, -0.48], 'C': [0.12, -0.89, 0.45, -1.05, -0.71, 2.41, 1.52, -0.69, 1.13, 1.1], 'E': [-1.45, 0.19, -1.61, 1.17, -1.31, 0.4, 0.04, 0.38, -0.35, -0.12], 'D': [0.58, -0.22, -1.58, 0.81, -0.92, 0.15, -1.52, 0.47, 0.76, 0.7], 'G': [1.46, -1.96, -0.23, -0.16, 0.1, -0.11, 1.32, 2.36, -1.66, 0.46], 'F': [-0.21, 0.98, -0.36, -1.43, 0.22, -0.81, 0.67, 1.1, 1.71, -0.44], 'I': [-0.73, -0.16, 1.79, -0.77, -0.54, 0.03, -0.83, 0.51, 0.66, -1.78], 'H': [-0.41, 0.52, -0.28, 0.28, 1.61, 1.01, -1.85, 0.47, 1.13, 1.63], 'K': [-0.34, 0.82, -0.23, 1.7, 1.54, -1.62, 1.15, -0.08, -0.48, 0.6], 'M': [-1.4, 0.18, -0.42, -0.73, 2.0, 1.52, 0.26, 0.11, -1.27, 0.27], 'L': [-1.04, 0.0, -0.24, -1.1, -0.55, -2.05, 0.96, -0.76, 0.45, 0.93], 'N': [1.14, -0.07, -0.12, 0.81, 0.18, 0.37, -0.09, 1.23, 1.1, -1.73], 'Q': [-0.47, 0.24, 0.07, 1.1, 1.1, 0.59, 0.84, -0.71, -0.03, -2.33], 'P': [2.06, -0.33, -1.15, -0.75, 0.88, -0.45, 0.3, -2.3, 0.74, -0.28], 'S': [0.81, -1.08, 0.16, 0.42, -0.21, -0.43, -1.89, -1.15, -0.97, -0.23], 'R': [0.22, 1.27, 1.37, 1.87, -1.7, 0.46, 0.92, -0.39, 0.23, 0.93], 'T': [0.26, -0.7, 1.21, 0.63, -0.1, 0.21, 0.24, -1.15, -0.56, 0.19], 'W': [0.3, 2.1, -0.72, -1.57, -1.16, 0.57, -0.48, -0.4, -2.3, -0.6], 'V': [-0.74, -0.71, 2.04, -0.4, 0.5, -0.81, -1.07, 0.06, -0.46, 0.65], 'Y': [1.38, 1.48, 0.8, -0.56, -0.0, -0.68, -0.31, 1.03, -0.05, 0.53], 'UNKNOWN': [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00] } } self.pdb_dir = pdb_dir self.contacts_dir = contacts_dir self.get_contacts_path = get_contacts_path self.covalent_bonds = covalent_bonds self.deprotonate = deprotonate if not intramolecular_interactions: self.INTERACTION_TYPES = ['sb', 'pc', 'ps', 'ts', 'vdw', 'hb', 'hbb', 'hbsb', 'hbbb', 'hbss', 'wb', 'wb2', 'hblb', 'hbls', 'lwb', 'lwb2', 'hp'] else: self.INTERACTION_TYPES = intramolecular_interactions self.INTERACTION_FDIM = len(self.INTERACTION_TYPES) # DGL Graph Constructors self.node_featuriser = node_featuriser self.edge_featuriser = edge_featuriser self.graph_constructor = graph_constructor if self.node_featuriser == 'meiler': self.node_fdim = 7 elif self.node_featuriser == 'kidera': self.node_fdim = 10 self.exclude_waters = exclude_waters
[docs] def dgl_graph_from_pdb_code(self, pdb_code=None, file_path=None, chain_selection='all', contact_file=None, edge_construction=['contacts'], encoding=False, k_nn=None, custom_edges=None): """ Produces a DGL graph from a PDB code and a selection of polypeptide chains :param file_path: :type file_path: str :param custom_edges: Pass user-defined custom edges to use in edge construction, defaults to None :type custom_edges: Pandas DataFrame, optional :param edge_construction: Specifies edge construction methods. {'contact', 'distance', 'custom'}, defaults to ['contacts'] :type edge_construction: list :param k_nn: Specifies number of nearest neighbours to make K_NN edges with :type k_nn: int :param encoding: Indicates whether or not node names and labels should be encoded :type encoding: bool :param contact_file: Path to local GetContacts output file, defaults to None :type contact_file: str :param pdb_code: 4 character PDB accession code :type pdb_code: str :param chain_selection: string indicating which chains to select {'A', 'B', 'AB', ..., 'all'}, defaults to 'all' :type chain_selection: list :return: DGLGraph object, nodes populated by residues or atoms as specified in class initialisation """ # Todo Error Handling if pdb_code: assert not file_path, "Do not provide both a PDB file and a file path" if file_path: assert not pdb_code, "Do not provide both a PDB file and a file path" if k_nn: assert 'k_nn' in edge_construction, "If providing KNN edges, include 'k_nn' in the edgge_construction list" if contact_file: assert 'contacts' not in edge_construction, \ 'do not provide a contacts file if not using contacts-based edge construction' if self.granularity == 'atom': g = self._make_atom_graph(file_path) return g if self.granularity == 'ss': raise NotImplementedError # todo SS granularity # get secondary structure nodes # add secondary stucture nodes # get secondary stucture edges # add secondary structure edges pass if self.granularity == 'CA' or 'CB' or 'centroids': # Download PDB if file not found if pdb_code: pdb_path = self.pdb_dir + pdb_code + '.pdb' if not os.path.isfile(pdb_path): self._download_pdb(pdb_code) # Create Relevant protein dataframes df = self._protein_df(pdb_path=self.pdb_dir + pdb_code + '.pdb') chains = self._get_chains(df, chain_selection) df = pd.concat(chains) # Populate graph with nodes g = self._add_protein_nodes(df) # Edge construction: if k_nn: edges = self._k_nn_edges(df, k_nn) g.add_edges(edges['res1'], edges['res2'], data={'k_nn_dist': torch.Tensor(list(edges['distance']))}) if not (contact_file and 'contacts' in edge_construction): self._compute_protein_contacts(pdb_code) if 'contacts' in edge_construction: edges = self._get_protein_edges(pdb_code, chain_selection, contact_file=None) g = self._add_protein_edges_to_graph(g, edges) if self.edge_distance_cutoff and 'distance' in edge_construction: # Get distance-based edges edges = self._distance_based_edges(df, self.edge_distance_cutoff) # Add edges g.add_edges(list(edges['level_0']), list(edges['level_1']), data={'dist': torch.Tensor(list(edges[0]))}) if 'delaunay' in edge_construction: edges = self._get_delaunay_edges(df, furthest_site=False, incremental=False) g.add_edges(list(edges['res1']), list(edges['res2']), data={'delaunay_euclidean_distance': torch.Tensor(list(edges['distance']))}) # Add user supplied edges if custom_edges: g.add_edges(list(custom_edges['res1']), list(custom_edges['res2']), data={'user_edge_data': torch.Tensor(list(custom_edges['data']))}) if self.include_ss: dssp = self._get_protein_features(pdb_code, file_path=None, chain_selection=chain_selection) feats = self._compute_protein_feature_representations(dssp) g = self._add_protein_features(g, feats) # Label Encoding of Node IDs if encoding: resiude_name_encoder = preprocessing.LabelEncoder() residue_id_encoder = preprocessing.LabelEncoder() residue_names = g.ndata['residue_name'] residue_id = g.ndata['id'] g.ndata['residue_name'] = resiude_name_encoder.fit_transform(residue_names) g.ndata['id'] = residue_id_encoder.fit_transform(residue_id) return g, resiude_name_encoder, residue_id_encoder if self.remove_string_labels: g.ndata.pop('residue_name') g.ndata.pop('id') if self.verbose: print(g) return g
[docs] def dgl_graph_from_pdb_file(self, file_path, chain_selection, contact_file, edges=None): """ Produces a DGL graph from a PDB file and a selection of polypeptide chains :param edges: User-defined custom edges, defaults to None :type edges: Pandas DataFram, optional :param contact_file: Path to local GetContacts output file :type contact_file: str :param file_path: 4 character PDB accession code :type file_path: str :param chain_selection: Polypeptide chains in structure to select {'A', 'B', 'AB', ..., 'all} :type chain_selection: str :return: DGLGraph object, nodes populated by residues or atoms as specified in class initialisation :rtype: DGLGraph """ # Atom-level Graph if self.granularity == 'atom': g = self._make_atom_graph(file_path) # Residue-level graph if self.granularity == 'CA' or 'CB': # Pre-process protein Df df = self._protein_df(pdb_path=file_path) chains = self._get_chains(df, chain_selection) df = pd.concat(chains) # Create Graph g = self._add_protein_nodes(df) # Add edges if not contact_file: self._compute_protein_contacts(file_path) if not edges: edges = self._get_protein_edges(file_path, chain_selection, contact_file) g = self._add_protein_edges_to_graph(g, edges) if self.include_ss: dssp = self._get_protein_features(file_path=file_path, pdb_code=None) feats = self._compute_protein_feature_representations(dssp) g = self._add_protein_features(g, feats) return g
[docs] def nx_graph_from_pdb_code(self, pdb_code, chain_selection='all', contact_file=None, edge_construction=['contacts'], encoding=False, k_nn=None, custom_edges=None): """ Produces a NetworkX Graph Object :param encoding: :type bool: :param edges: User-supplied edges, defaults to None :type edges: Pandas DataFrame, optional :param pdb_code: 4 character PDB accession code :type pdb_code: str :param chain_selection: string indicating chain selection {'A', 'B', 'AB', ..., 'all'}, defaults to 'all' :type chain_selection: str :param contact_file: Path to GetContacts output file. :type contact_file: str, optional :return: NetworkX graph object of protein :rtype: NetworkX graph """ assert encoding, 'Non-numeric feature encoding must be True' g, resiude_name_encoder, residue_id_encoder = self.dgl_graph_from_pdb_code(pdb_code=pdb_code, chain_selection=chain_selection, contact_file=contact_file, edge_construction=edge_construction, custom_edges=custom_edges, encoding=encoding, k_nn=k_nn ) node_attrs = g.node_attr_schemes().keys() edge_attrs = g.edge_attr_schemes().keys() return dgl.to_networkx(g, node_attrs, edge_attrs), resiude_name_encoder, residue_id_encoder
[docs] def nx_graph_from_pdb_file(self, pdb_code, chain_selection='all', contact_file=None): """ Produces a NetworkX Graph Object :param pdb_code: 4 character PDB accession code :type pdb_code: str :param chain_selection: string indicating chain selection {'A', 'B', 'AB', ..., 'all'} :type chain_selection: str :param contact_file: Path to GetContacts output file. :type contact_file: str, optional :return: NetworkX graph object of protein """ g, resiude_name_encoder, residue_id_encoder = self.dgl_graph_from_pdb_file(pdb_code, chain_selection, contact_file) node_attrs = g.node_attr_schemes().keys() edge_attrs = g.edge_attr_schemes().keys() return dgl.to_networkx(g, node_attrs, edge_attrs), resiude_name_encoder, residue_id_encoder
[docs] def torch_geometric_graph_from_pdb_code(self, pdb_code, chain_selection='all', edge_construction=['contacts'], contact_file=None, encoding=False, k_nn=None, custom_edges=None): """ Produces a PyToch Geometric Data object from a protein structure :param k_nn: Specifies K nearest neighbours to use in KNN edge construction, defaults to None :type k_nn: int, optional :param custom_edges: User-supplied edges to use, defaults to None :type custom_edges: Pandas DataFrame, optional :param encoding: :type encoding: bool :param edge_construction: List containing edge construction to be used. ['contacts', 'distance', 'delaunay'], defaults to ['contacts'] :type edge_construction: list :param pdb_code: 4-character PDB accession code :type pdb_code: str :param chain_selection: Specifies polypeptide chains to include. e.g. one of {'A', 'B' ,'AB', 'BC'}, defaults to 'all' :type chain_selection: str :param contact_file: Path to contact file if using local file. :type contact_file: str :return: Pytorch Geometric Graph of protein structure. :rtype: PyTorch Geometric Data object """ assert encoding, 'Non-numeric feature encoding must be True' g, resiude_name_encoder, residue_id_encoder = self.dgl_graph_from_pdb_code(pdb_code=pdb_code, chain_selection=chain_selection, contact_file=contact_file, edge_construction=edge_construction, custom_edges=custom_edges, encoding=encoding, k_nn=k_nn) # Get node features from DGL graph and concatenate them node_feature_names = g.node_attr_schemes().keys() dgl_graph_features = [g.ndata[feat].float() for feat in node_feature_names] dgl_graph_features = [f.unsqueeze(dim=1) if len(f.shape) == 1 else f for f in dgl_graph_features] node_features = torch.cat(dgl_graph_features, dim=1) # Get edge features from DGL graph and concatenate them edge_types = g.edge_attr_schemes().keys() edge_feats = [g.edata[e].float() for e in edge_types] edge_feats = [e.unsqueeze(dim=1) if len(e.shape) == 1 else e for e in edge_feats] edge_feats = torch.cat(edge_feats, dim=1) # Create the Torch Geometric graph geom_graph = (Data(x=node_features, edge_index=torch.stack(g.edges(), dim=1), edge_attr=edge_feats )) print(geom_graph) return geom_graph
def _make_atom_graph(self, pdb_code=None, pdb_path=None, node_featurizer=None, edge_featurizer=None, graph_type='bigraph'): """ Create atom-level graph from PDB structure :param graph_type: :param pdb_code: :param pdb_path: :param node_featurizer: :param edge_featurizer: :return: """ if node_featurizer is None: node_featurizer = CanonicalAtomFeaturizer() if edge_featurizer is None: edge_featurizer = CanonicalBondFeaturizer() # Read in protein as mol # if pdb_path: if pdb_code: pdb_path = self.pdb_dir + pdb_code + '.pdb' if not os.path.isfile(pdb_path): self._download_pdb(pdb_code) assert os.path.isfile(pdb_path) mol = MolFromPDBFile(pdb_path) # DGL mol to graph if graph_type == 'bigraph': g = mol_to_bigraph(mol, node_featurizer=node_featurizer, edge_featurizer=edge_featurizer ) elif graph_type == 'complete': g = mol_to_complete_graph(mol, node_featurizer=node_featurizer, ) elif graph_type == 'k_nn': raise NotImplementedError print(g) return g def _protein_df(self, pdb_path): """ Pre-processes protein structure dataframe. :param pdb_path: :param pdb_code - 4 letter PDB accession code :return: 'cleaned protein dataframe' """ protein_df = PandasPdb().read_pdb(pdb_path) atoms = protein_df.df['ATOM'] hetatms = protein_df.df['HETATM'] if self.granularity == 'centroids': if self.deprotonate: atoms = atoms.loc[atoms['atom_name'] != 'H'].reset_index() centroids = self._calculate_centroid_positions(atoms) atoms = atoms.loc[atoms['atom_name'] == 'CA'].reset_index() atoms['x_coord'] = centroids['x_coord'] atoms['y_coord'] = centroids['y_coord'] atoms['z_coord'] = centroids['z_coord'] else: atoms = atoms.loc[atoms['atom_name'] == self.granularity] if self.keep_hets: if self.exclude_waters: hetatms = hetatms.loc[hetatms['residue_name'] != 'HOH'] if self.verbose: print(f'Detected {len(hetatms)} HETATOM nodes') protein_df = pd.concat([atoms, hetatms]) else: protein_df = atoms # Remove alt_loc resdiues protein_df = protein_df.loc[protein_df['alt_loc'].isin(['', 'A'])] if self.verbose: print(f'Detected {len(protein_df)} total nodes') return protein_df def _calculate_centroid_positions(self, atoms): """ Calculates position of sidechain centroids :param atoms: ATOM df of protein structure :return: centroids (df) """ centroids = atoms.groupby('residue_number').mean()[['x_coord', 'y_coord', 'z_coord']].reset_index() if self.verbose: print(f'Calculated {len(centroids)} centroid nodes') return centroids @staticmethod def _get_chains(protein_df, chain_selection): """ Extracts relevant chains from protein_df :param protein_df: pandas dataframe of PDB subsetted to relevant atoms (CA, CB) :param chain_selection: :return """ if chain_selection != 'all': chains = [protein_df.loc[protein_df['chain_id'] == chain] for chain in chain_selection] else: chains = [protein_df.loc[protein_df['chain_id'] == chain] for chain in protein_df['chain_id'].unique()] return chains def _add_protein_nodes(self, chain): """ Add protein nodes to graph from list of PandasPDB dataframes for each chain :param chain: (list of dataframes) Contains a dataframe for each chain in the protein :return: g (DGLGraph): Graph of protein only populated by the nodes """ g = dgl.DGLGraph() nodes = chain['chain_id'] + ':' + chain['residue_name'] + ':' + chain['residue_number'].apply(str) if self.granularity == 'atom': nodes = nodes + ':' + chain['atom_name'] node_features = [self._aa_features(residue, self.node_featuriser) for residue in chain['residue_name']] coords = torch.Tensor(np.asarray(chain[['x_coord', 'y_coord', 'z_coord']])).type('torch.FloatTensor') g.add_nodes(len(nodes), {'id': nodes, 'residue_name': chain['residue_name'], 'h': torch.stack(node_features).type('torch.FloatTensor'), 'coords': coords }) return g def _aa_features(self, residue, embedding): """ Retrieves amino acid embeddings :param residue: str specifying the amino acid :param embedding: embedding to use {'meiler', 'kidera'} :return: features: torch tensor of features """ if residue not in self.embedding_dict[embedding].keys(): residue = 'UNKNOWN' features = torch.Tensor(self.embedding_dict[embedding][residue]).double() return features def _download_pdb(self, pdb_code): """ Download PDB structure from PDB :param pdb_code: 4 character PDB accession code :return: # todo impl return """ # Initialise class and download pdb file pdbl = PDBList() pdbl.retrieve_pdb_file(pdb_code, pdir=self.pdb_dir, overwrite=True, file_format='pdb') # Rename file to .pdb from .ent os.rename(self.pdb_dir + "pdb" + pdb_code + '.ent', self.pdb_dir + pdb_code + '.pdb') # Assert file has been downloaded assert any(pdb_code in s for s in os.listdir(self.pdb_dir)) print(f'Downloaded PDB file for: {pdb_code}') def _compute_protein_contacts(self, pdb_code, file_name=False): """Computes contacts from .pdb file using GetContacts - https://www.github.com/getcontacts/getcontacs :param: pdb_code - 4 character PDB accession code """ # Check for existence of contacts file if file_name: contacts_file = glob.glob(self.contacts_dir + file_name) else: contacts_file = glob.glob(self.contacts_dir + "*" + pdb_code + "*.tsv") if contacts_file: print(f'Contact file found: {contacts_file}') return print(pdb_code) # Check for existence of pdb file pdb_file = glob.glob(self.pdb_dir + "*" + pdb_code + "*.pdb") print(pdb_file) if not pdb_file: # Download PDB file print('PDB file not downloaded') # self.download_pdb(pdb_code) pdb_file = self.pdb_dir + pdb_code + ".pdb" else: pdb_file = pdb_file[0] print(f'PDB file detected: {pdb_file}') # Run GetContacts command = f'{self.get_contacts_path}/get_static_contacts.py ' command += f'--structure {pdb_file} ' command += f'--output {self.contacts_dir + pdb_code + "_contacts.tsv"} ' command += '--itypes all' # --sele "protein"' subprocess.run(command, shell=True) assert os.path.isfile(self.contacts_dir + pdb_code + "_contacts.tsv") print(f'Computed Contacts for: {pdb_code}') def _get_protein_edges(self, pdb_code, chain_selection, contact_file): """ Compute protein edges :param contact_file: :param chain_selection: :param pdb_code: 4 character pdb accession code :return: edges : dataframe containing edges derived from GetContacts analysis # todo impl covalent bond structure """ if not contact_file: contact_file = self.contacts_dir + pdb_code + '_contacts' + '.tsv' edges = set() # Read Contacts File with open(contact_file, 'r') as f: next(f) next(f) for line in f: linfo = line.strip().split('\t') interaction_type = linfo[1] # Select interacting Residues if self.granularity == 'CA' or 'CB' or 'atom': res1 = linfo[2] res2 = linfo[3] if self.granularity != 'atom': res1 = re.search(r'.\:(.*?)\:(.*?)(?=:)', res1)[0] res2 = re.search(r'.\:(.*?)\:(.*?)(?=:)', res2)[0] # Add edge to set of edges edges.add((res1, res2, interaction_type)) edges = pd.DataFrame(list(edges), columns=['res1', 'res2', 'interaction_type']) # Remove all unallowed interactions edges = edges.loc[edges['interaction_type'].isin(self.INTERACTION_TYPES)] if chain_selection != 'all': edges = edges.loc[edges['res1'].str.startswith(tuple(chain_selection))] edges = edges.loc[edges['res2'].str.startswith(tuple(chain_selection))] # Filter out interactions for disordered/unassigned residues edges = edges.loc[~edges['res1'].str.contains("[A-Z]$")] edges = edges.loc[~edges['res2'].str.contains("[A-Z]$")] edges = edges.loc[~edges['res1'].str.contains(":0$")] edges = edges.loc[~edges['res2'].str.contains(":0$")] edges = edges.loc[~edges['res1'].str.contains('^X:')] edges = edges.loc[~edges['res2'].str.contains('^X:')] if self.long_interaction_threshold: res1 = edges['res1'].str.extract('(\d+)').astype(int) res2 = edges['res2'].str.extract('(\d+)').astype(int) inds = abs(res1 - res2) > self.long_interaction_threshold edges = edges[inds] if self.verbose: print(f'Calculated {len(edges)} intramolecular interaction-based edges') return edges def _add_protein_edges_to_graph(self, g, e): """ Add protein edges from dataframe of edges :param g: Dgl graph of protein :param e: Pandas dataframe of edges :return: g DGL Graph with edges added """ if self.granularity == 'dense': g.add_edges( [i for i in range(g.number_of_nodes()) for j in range(g.number_of_nodes() - 1)], [ j for i in range(g.number_of_nodes()) for j in range(g.number_of_nodes()) if i != j ]) return g else: index = dict(zip(list(g.ndata['id']), list(range(len(g.ndata['id']))) )) # Remove interactions for edges between nodes not in graph. E.g hetatms e = e.loc[e['res1'].isin(index.keys())] e = e.loc[e['res2'].isin(index.keys())] res1_ind = [index[res] for res in e['res1']] res2_ind = [index[res] for res in e['res2']] interactions = [self._onek_encoding_unk(interaction, self.INTERACTION_TYPES) for interaction in e['interaction_type']] g.add_edges(res1_ind, res2_ind, {'rel_type': torch.Tensor(interactions).double(), 'norm': torch.ones(len(interactions))}) return g @staticmethod def _onek_encoding_unk(x, allowable_set): """ Function for one hot encoding :param x: value to one-hot :param allowable_set: set of options to encode :return: one-hot encoding as torch tensor """ # if x not in allowable_set: # x = allowable_set[-1] return [x == s for s in allowable_set] def _get_protein_features(self, pdb_code, file_path, chain_selection): """ :param file_path: (str) file path to PDB file :param pdb_code: (str) String containing four letter PDB accession :return df (pd.DataFrame): Dataframe containing output of DSSP (Solvent accessibility, secondary structure for each residue) """ # Run DSSP on relevant PDB file if pdb_code: d = dssp_dict_from_pdb_file(self.pdb_dir + pdb_code + '.pdb') if file_path: d = dssp_dict_from_pdb_file(file_path) # Parse DSSP output to DataFrame appender = [] for k in d[1]: to_append = [] y = d[0][k] chain = k[0] residue = k[1] het = residue[0] resnum = residue[1] icode = residue[2] to_append.extend([chain, resnum, icode]) to_append.extend(y) appender.append(to_append) cols = ['chain', 'resnum', 'icode', 'aa', 'ss', 'exposure_rsa', 'phi', 'psi', 'dssp_index', 'NH_O_1_relidx', 'NH_O_1_energy', 'O_NH_1_relidx', 'O_NH_1_energy', 'NH_O_2_relidx', 'NH_O_2_energy', 'O_NH_2_relidx', 'O_NH_2_energy'] df = pd.DataFrame.from_records(appender, columns=cols) # Subset dataframe to those in chain_selection if chain_selection != 'all': df = df.loc[df['chain'].isin(chain_selection)] # Rename cysteines to 'C' df['aa'] = df['aa'].str.replace('[a-z]', 'C') df = df[df['aa'].isin(list(aa1))] # Drop alt_loc residues df = df.loc[df['icode'] == ' '] # Add additional Columns df['aa_three'] = df['aa'].apply(one_to_three) df['max_acc'] = df['aa_three'].map(residue_max_acc['Sander'].get) df[['exposure_rsa', 'max_acc']] = df[['exposure_rsa', 'max_acc']].astype(float) df['exposure_asa'] = df['exposure_rsa'] * df['max_acc'] df['index'] = df['chain'] + ':' + df['aa_three'] + ':' + df['resnum'].apply(str) return df def _compute_protein_feature_representations(self, dssp_df): """ :param dssp_df: (pd.DataFrame): Df containing parsed output of DSSP :return feature_dict (dict): Dictionary of tensorized features """ # One hot encoded secondary structure assignments ss_set = ['G', 'H', 'I', 'E', 'B', 'T', 'S', 'C', '-'] ss = [self._onek_encoding_unk(ss, ss_set) for ss in dssp_df['ss']] # Create feature dictionary feature_dict = {'ss': torch.Tensor(ss), 'asa': torch.Tensor(np.asarray(dssp_df['exposure_asa'])).reshape(len(dssp_df), 1), 'rsa': torch.Tensor(np.asarray(dssp_df['exposure_rsa'])).reshape(len(dssp_df), 1) } return feature_dict @staticmethod def _add_protein_features(g, feature_dict): """ Add computed protein features to graph :param g: DGL Graph of protein. :param feature_dict: Dictionary of features calculated by DSSP :return: g DGL Graph of protein with SS and solvent accessibility features added to node data """ # 0 Pad Tensors for Proteins with HETATMS that DSSP Can't Deal with pad_length = len(g.ndata['h']) - len(feature_dict['ss']) if pad_length > 0: pad = [0, 0, 0, pad_length] feature_dict['ss'] = F.pad(feature_dict['ss'], pad, 'constant', 0) feature_dict['asa'] = F.pad(feature_dict['asa'], pad, 'constant', 0) feature_dict['rsa'] = F.pad(feature_dict['rsa'], pad, 'constant', 0) # Assign Features g.ndata['ss'] = feature_dict['ss'] g.ndata['asa'] = feature_dict['asa'] g.ndata['rsa'] = feature_dict['rsa'] return g def _k_nn_edges(self, protein_df, k, mode='connectivity', metric='minkowski', p=2, include_self=False): """ Construct edges based on K nearest neighbours :param protein_df: PandasPDB DF of protein structure :param k: number of nearest neighbour edges for each node :param mode: {'connectivity', 'distance'} :param metric: {'minkowskii} :param p: :param include_self: bool - whether or not to include self-loops :return: """ # Create distance matrix coords = protein_df[['x_coord', 'y_coord', 'z_coord']] # dists = pairwise_distances(np.asarray(coords)) dists = np.asarray(coords) # Perform K-NN on coordinates nn = kneighbors_graph(X=dists, n_neighbors=k, mode=mode, metric=metric, p=p, include_self=include_self ) # Create dataframe of edges outgoing = np.repeat(np.array(range(len(coords))), k) incoming = nn.indices edge_df = pd.DataFrame({'res1': outgoing, 'res2': incoming, 'distance': nn.data }) if self.long_interaction_threshold: edge_df = edge_df.loc[abs(abs(edge_df['res1']) - abs(edge_df['res2'])) > self.long_interaction_threshold] if self.verbose: print(f'Calculated {len(edge_df)} K-nearest neighbour edges') return edge_df def _distance_based_edges(self, protein_df, cutoff): """ Calculate distance-based edges from coordinates in 3D structure. Produce Edge list dataframe based on pairwise distance matrix calculation :param protein_df: PandasPDB Dataframe :param cutoff: Distance threshold to create an edge (Angstroms) :return: dists : pandas dataframe of edge list and distance """ # Create distance matrix coords = protein_df[['x_coord', 'y_coord', 'z_coord']] dists = pairwise_distances(np.asarray(coords)) # Filter distance matrix and select lower triangle dists = pd.DataFrame(np.tril(np.where(dists < cutoff, dists, 0))) # Reshape to produce edge list dists.values[[np.arange(len(dists))] * 2] = np.nan dists = dists.stack().reset_index() # Filter to remove edges that exceed cutoff dists = dists.loc[dists[0] != 0] if self.long_interaction_threshold: dists = dists.loc[abs(abs(dists['level_0']) - abs(dists['level_1'])) > self.long_interaction_threshold] if self.verbose: print(f'Calcuclated {len(dists)} distance-based edges') return dists """" @staticmethod def get_voronoi_edges(protein_df, furthest_site=False, incremental=False): #Calculate Voronoi edges from protein dataframe #:param protein_df: #:param furthest_site: ##:param incremental: #:return: coord = protein_df[['x_coord', 'y_coord', 'z_coord']] vor = spatial.Voronoi(points=coord, furthest_site=furthest_site, incremental=incremental) edges = pd.DataFrame(vor.ridge_points) print(edges) edges.columns = ['res1', 'res2'] print(f'Calculated {len(edges)} voronoi-ridge edges') return edges """ def _get_delaunay_edges(self, protein_df, furthest_site=False, incremental=False): """ Calculate Delaunay edges from a dataframe of coordinates :param protein_df: :param furthest_site: :param incremental: :return: """ coord = protein_df[['x_coord', 'y_coord', 'z_coord']] delaunay = spatial.Delaunay(coord, furthest_site=furthest_site, incremental=incremental) # Turn simplices into edgelist edges = [] indices, indptr = delaunay.vertex_neighbor_vertices for i in range(indices.shape[0] - 1): for j in indptr[indices[i]:indices[i + 1]]: try: edges.append([i, j]) except IndexError: pass # Create edge DataFrame edge_df = pd.DataFrame(edges) edge_df.columns = ['res1', 'res2'] # Get distances between edges distances = [] for row in range(len(edge_df)): a = coord.iloc[edge_df.iloc[row]['res1']] b = coord.iloc[edge_df.iloc[row]['res2']] distances.append(spatial.distance.euclidean(a, b)) edge_df['distance'] = distances if self.long_interaction_threshold: edge_df = edge_df.loc[abs(abs(edge_df['res1']) - abs(edge_df['res2'])) > self.long_interaction_threshold] if self.verbose: print(f'Calculated {len(edge_df)} Delaunay edges') return edge_df
if __name__ == "__main__": """ pg = ProteinGraph(granularity='CA', insertions=False, keep_hets=True, node_featuriser='meiler', allowed_interactions=None, get_contacts_path='/home/arj39/Documents/github/getcontacts', pdb_dir='/home/arj39/Documents/test/pdb/', contacts_dir='/home/arj39/Documents/test/contacts/', exclude_waters=True, covalent_bonds=False, include_ss=True, include_ligand=False, edge_distance_cutoff=5 # node_featuriser=dgl.data.chem.atom_type_one_hot(), # edge_featuriser=dgl.data.chem.bond_type_one_hot(), # graph_constructor=dgl.data.chem.mol_to_graph()) ) """ """ pg = ProteinGraph(granularity='CA', insertions=False, keep_hets=True, node_featuriser='meiler', allowed_interactions=None, get_contacts_path='/home/arj39/Documents/github/getcontacts', pdb_dir='/home/arj39/Documents/test/pdb/', contacts_dir='/home/arj39/Documents/test/contacts/', exclude_waters=True, covalent_bonds=False, include_ss=True, include_ligand=False, edge_distance_cutoff=10 # node_featuriser=dgl.data.chem.atom_type_one_hot(), # edge_featuriser=dgl.data.chem.bond_type_one_hot(), # graph_constructor=dgl.data.chem.mol_to_graph()) ) """ pg = ProteinGraph(granularity='CA', insertions=False, keep_hets=False, node_featuriser='meiler', intramolecular_interactions=None, get_contacts_path='/Users/arianjamasb/github/getcontacts', pdb_dir='../examples/pdbs/', contacts_dir='../examples/contacts/', exclude_waters=True, covalent_bonds=False, include_ss=True, include_ligand=False, verbose=True, long_interaction_threshold=5, edge_distance_cutoff=10 ) """ g = pg.dgl_graph_from_pdb_code('3eiy', chain_selection='all', edge_construction=['distance', 'delaunay'], # , 'delaunay', 'k_nn'], encoding=False, k_nn=None) pg.torch_geometric_graph_from_pdb_code('3eiy', chain_selection='all', edge_construction=['distance', 'delaunay'], encoding=True, k_nn=None) """ g, _, __ = pg.nx_graph_from_pdb_code(pdb_code='3eiy', chain_selection='all', edge_construction=['contacts'], encoding=True) #pg.make_atom_graph(pdb_code='3eiy') # Check KNN # g, resiude_name_encoder, residue_id_encoder = pg.nx_graph_from_pdb_code('3eiy', chain_selection='all', # edge_construction=['distance', 'contacts'], # encoding=True)