"""Class for working with Protein Structure Graphs"""
# %%
# Graphein
# Author: Arian Jamasb <[email protected]>
# License: MIT
# Project Website: https://github.com/a-r-j/graphein
# Code Repository: https://github.com/a-r-j/graphein
import os
import glob
import re
import pandas as pd
import numpy as np
import dgl
import subprocess
import networkx as nx
import torch as torch
import torch.nn.functional as F
from torch_geometric.data import Data
from biopandas.pdb import PandasPdb
from Bio.PDB import *
from Bio.PDB.DSSP import residue_max_acc, dssp_dict_from_pdb_file
from Bio.PDB.Polypeptide import aa1, one_to_three
from dgllife.utils import mol_to_bigraph, mol_to_complete_graph, mol_to_nearest_neighbor_graph
from dgllife.utils import BaseBondFeaturizer, BaseAtomFeaturizer, CanonicalAtomFeaturizer, CanonicalBondFeaturizer
from rdkit.Chem import MolFromPDBFile
from sklearn.metrics import pairwise_distances
from sklearn import preprocessing
from sklearn.neighbors import kneighbors_graph
from scipy import spatial
# Todo add SS featuriser for Mol Graph?
# Todo atom featuriser
# Todo create SS element-level graph
# Mol graph Nearest Neighbour
[docs]class ProteinGraph(object):
[docs] def __init__(self, granularity, keep_hets, insertions, node_featuriser, get_contacts_path, pdb_dir,
contacts_dir, exclude_waters=True, covalent_bonds=True, include_ss=True, include_ligand=False,
intramolecular_interactions=None, graph_constructor=None, edge_featuriser=None,
edge_distance_cutoff=None, verbose=True, deprotonate=False, remove_string_labels=False,
long_interaction_threshold=None):
"""
Initialise ProteinGraph Generator Class
:param granularity: Specifies granularity of the graph construction. {'atom', 'CA', 'CB'}. CA = Alpha Carbon, CB = Beta Carbon
:type granularity: str
:param keep_hets: Keep heteroatoms present in the PDB file. Typically, these correspond to metal ions or modified residues (e.g. MSE)
:type keep_hets: bool
:param insertions: Keep atoms/residues with multiple insertion positions. Multiple insertions exist when the electron density is too vague to define a single insertion
:type insertions: bool
:param node_featuriser: DGL Node featuriser for atom-level graphs. Canonical Featurises recommended.
:type node_featuriser: DGL Node Featuriser
:param pdb_dir: Directory to PDB files. We will download .PDB files to this folder if you don't have an existing local copy of the requisite structure
:type pdb_dir: str
:param contacts_dir: Directory to GetContacts files
:type contacts_dir: str
:param exclude_waters: Specifies inclusion of water molecules. Not yet fully operational.
:type exclude_waters: bool
:param covalent_bonds: Specifies inclusion of covalent backbone. E.g. joins adjacent residues in the sequence
:type covalent_bonds: bool
:param include_ss: Specifies inclusion of secondary structure features computed by DSSP. Future warning: this will be changed in a subsequent update for managing feature selection.
:type include_ss: bool
:param include_ligand: Not yet implemented. Will specify option to include bound ligand(s) in the graph.
:type include_ligand: bool
:param intramolecular_interactions: List of allowable intramolecular interactions to include from GetContacts. ['sb', 'pc', 'ps', 'ts', 'vdw', 'hb', 'hbb', 'hbsb', 'hbbb', 'hbss', 'wb', 'wb2',
'hblb', 'hbls', 'lwb', 'lwb2', 'hp']. See https://getcontacts.github.io/interactions.html for details.
:type intramolecular_interactions: list
:param edge_distance_cutoff: Distance in angstroms specifying cutoff distance for constructing an edge when using distance construction
:type edge_distance_cutoff: float
:param long_interaction_threshold: Specifies minimum distance in sequence for two nodes to be connected
:type long_interaction_threshold: int
"""
self.long_interaction_threshold = long_interaction_threshold
self.remove_string_labels = remove_string_labels
self.verbose = verbose
self.edge_distance_cutoff = edge_distance_cutoff
self.include_ligand = include_ligand
self.include_ss = include_ss
self.granularity = granularity
self.keep_hets = keep_hets
self.insertions = insertions
self.node_featuriser = node_featuriser
self.embedding_dict = {
'meiler': {
'ALA': [1.28, 0.05, 1.00, 0.31, 6.11, 0.42, 0.23],
'GLY': [0.00, 0.00, 0.00, 0.00, 6.07, 0.13, 0.15],
'VAL': [3.67, 0.14, 3.00, 1.22, 6.02, 0.27, 0.49],
'LEU': [2.59, 0.19, 4.00, 1.70, 6.04, 0.39, 0.31],
'ILE': [4.19, 0.19, 4.00, 1.80, 6.04, 0.30, 0.45],
'PHE': [2.94, 0.29, 5.89, 1.79, 5.67, 0.30, 0.38],
'TYR': [2.94, 0.30, 6.47, 0.96, 5.66, 0.25, 0.41],
'PTR': [2.94, 0.30, 6.47, 0.96, 5.66, 0.25, 0.41],
'TRP': [3.21, 0.41, 8.08, 2.25, 5.94, 0.32, 0.42],
'THR': [3.03, 0.11, 2.60, 0.26, 5.60, 0.21, 0.36],
'TPO': [3.03, 0.11, 2.60, 0.26, 5.60, 0.21, 0.36],
'SER': [1.31, 0.06, 1.60, -0.04, 5.70, 0.20, 0.28],
'SEP': [1.31, 0.06, 1.60, -0.04, 5.70, 0.20, 0.28],
'ARG': [2.34, 0.29, 6.13, -1.01, 10.74, 0.36, 0.25],
'LYS': [1.89, 0.22, 4.77, -0.99, 9.99, 0.32, 0.27],
'KCX': [1.89, 0.22, 4.77, -0.99, 9.99, 0.32, 0.27],
'LLP': [1.89, 0.22, 4.77, -0.99, 9.99, 0.32, 0.27],
'HIS': [2.99, 0.23, 4.66, 0.13, 7.69, 0.27, 0.30],
'ASP': [1.60, 0.11, 2.78, -0.77, 2.95, 0.25, 0.20],
'GLU': [1.56, 0.15, 3.78, -0.64, 3.09, 0.42, 0.21],
'PCA': [1.56, 0.15, 3.78, -0.64, 3.09, 0.42, 0.21],
'ASN': [1.60, 0.13, 2.95, -0.60, 6.52, 0.21, 0.22],
'GLN': [1.56, 0.18, 3.95, -0.22, 5.65, 0.36, 0.25],
'MET': [2.35, 0.22, 4.43, 1.23, 5.71, 0.38, 0.32],
'MSE': [2.35, 0.22, 4.43, 1.23, 5.71, 0.38, 0.32],
'PRO': [2.67, 0.00, 2.72, 0.72, 6.80, 0.13, 0.34],
'CYS': [1.77, 0.13, 2.43, 1.54, 6.35, 0.17, 0.41],
'CSO': [1.77, 0.13, 2.43, 1.54, 6.35, 0.17, 0.41],
'CAS': [1.77, 0.13, 2.43, 1.54, 6.35, 0.17, 0.41],
'CAF': [1.77, 0.13, 2.43, 1.54, 6.35, 0.17, 0.41],
'CSD': [1.77, 0.13, 2.43, 1.54, 6.35, 0.17, 0.41],
'UNKNOWN': [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00]
},
'kidera': {
'A': [-1.56, -1.67, -0.97, -0.27, -0.93, -0.78, -0.2, -0.08, 0.21, -0.48],
'C': [0.12, -0.89, 0.45, -1.05, -0.71, 2.41, 1.52, -0.69, 1.13, 1.1],
'E': [-1.45, 0.19, -1.61, 1.17, -1.31, 0.4, 0.04, 0.38, -0.35, -0.12],
'D': [0.58, -0.22, -1.58, 0.81, -0.92, 0.15, -1.52, 0.47, 0.76, 0.7],
'G': [1.46, -1.96, -0.23, -0.16, 0.1, -0.11, 1.32, 2.36, -1.66, 0.46],
'F': [-0.21, 0.98, -0.36, -1.43, 0.22, -0.81, 0.67, 1.1, 1.71, -0.44],
'I': [-0.73, -0.16, 1.79, -0.77, -0.54, 0.03, -0.83, 0.51, 0.66, -1.78],
'H': [-0.41, 0.52, -0.28, 0.28, 1.61, 1.01, -1.85, 0.47, 1.13, 1.63],
'K': [-0.34, 0.82, -0.23, 1.7, 1.54, -1.62, 1.15, -0.08, -0.48, 0.6],
'M': [-1.4, 0.18, -0.42, -0.73, 2.0, 1.52, 0.26, 0.11, -1.27, 0.27],
'L': [-1.04, 0.0, -0.24, -1.1, -0.55, -2.05, 0.96, -0.76, 0.45, 0.93],
'N': [1.14, -0.07, -0.12, 0.81, 0.18, 0.37, -0.09, 1.23, 1.1, -1.73],
'Q': [-0.47, 0.24, 0.07, 1.1, 1.1, 0.59, 0.84, -0.71, -0.03, -2.33],
'P': [2.06, -0.33, -1.15, -0.75, 0.88, -0.45, 0.3, -2.3, 0.74, -0.28],
'S': [0.81, -1.08, 0.16, 0.42, -0.21, -0.43, -1.89, -1.15, -0.97, -0.23],
'R': [0.22, 1.27, 1.37, 1.87, -1.7, 0.46, 0.92, -0.39, 0.23, 0.93],
'T': [0.26, -0.7, 1.21, 0.63, -0.1, 0.21, 0.24, -1.15, -0.56, 0.19],
'W': [0.3, 2.1, -0.72, -1.57, -1.16, 0.57, -0.48, -0.4, -2.3, -0.6],
'V': [-0.74, -0.71, 2.04, -0.4, 0.5, -0.81, -1.07, 0.06, -0.46, 0.65],
'Y': [1.38, 1.48, 0.8, -0.56, -0.0, -0.68, -0.31, 1.03, -0.05, 0.53],
'UNKNOWN': [0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00]
}
}
self.pdb_dir = pdb_dir
self.contacts_dir = contacts_dir
self.get_contacts_path = get_contacts_path
self.covalent_bonds = covalent_bonds
self.deprotonate = deprotonate
if not intramolecular_interactions:
self.INTERACTION_TYPES = ['sb', 'pc', 'ps', 'ts', 'vdw', 'hb', 'hbb', 'hbsb', 'hbbb', 'hbss', 'wb', 'wb2',
'hblb', 'hbls', 'lwb', 'lwb2', 'hp']
else:
self.INTERACTION_TYPES = intramolecular_interactions
self.INTERACTION_FDIM = len(self.INTERACTION_TYPES)
# DGL Graph Constructors
self.node_featuriser = node_featuriser
self.edge_featuriser = edge_featuriser
self.graph_constructor = graph_constructor
if self.node_featuriser == 'meiler':
self.node_fdim = 7
elif self.node_featuriser == 'kidera':
self.node_fdim = 10
self.exclude_waters = exclude_waters
[docs] def dgl_graph_from_pdb_code(self, pdb_code=None, file_path=None, chain_selection='all', contact_file=None,
edge_construction=['contacts'], encoding=False, k_nn=None, custom_edges=None):
"""
Produces a DGL graph from a PDB code and a selection of polypeptide chains
:param file_path:
:type file_path: str
:param custom_edges: Pass user-defined custom edges to use in edge construction, defaults to None
:type custom_edges: Pandas DataFrame, optional
:param edge_construction: Specifies edge construction methods. {'contact', 'distance', 'custom'}, defaults to ['contacts']
:type edge_construction: list
:param k_nn: Specifies number of nearest neighbours to make K_NN edges with
:type k_nn: int
:param encoding: Indicates whether or not node names and labels should be encoded
:type encoding: bool
:param contact_file: Path to local GetContacts output file, defaults to None
:type contact_file: str
:param pdb_code: 4 character PDB accession code
:type pdb_code: str
:param chain_selection: string indicating which chains to select {'A', 'B', 'AB', ..., 'all'}, defaults to 'all'
:type chain_selection: list
:return: DGLGraph object, nodes populated by residues or atoms as specified in class initialisation
"""
# Todo Error Handling
if pdb_code:
assert not file_path, "Do not provide both a PDB file and a file path"
if file_path:
assert not pdb_code, "Do not provide both a PDB file and a file path"
if k_nn:
assert 'k_nn' in edge_construction, "If providing KNN edges, include 'k_nn' in the edgge_construction list"
if contact_file:
assert 'contacts' not in edge_construction, \
'do not provide a contacts file if not using contacts-based edge construction'
if self.granularity == 'atom':
g = self._make_atom_graph(file_path)
return g
if self.granularity == 'ss':
raise NotImplementedError
# todo SS granularity
# get secondary structure nodes
# add secondary stucture nodes
# get secondary stucture edges
# add secondary structure edges
pass
if self.granularity == 'CA' or 'CB' or 'centroids':
# Download PDB if file not found
if pdb_code:
pdb_path = self.pdb_dir + pdb_code + '.pdb'
if not os.path.isfile(pdb_path):
self._download_pdb(pdb_code)
# Create Relevant protein dataframes
df = self._protein_df(pdb_path=self.pdb_dir + pdb_code + '.pdb')
chains = self._get_chains(df, chain_selection)
df = pd.concat(chains)
# Populate graph with nodes
g = self._add_protein_nodes(df)
# Edge construction:
if k_nn:
edges = self._k_nn_edges(df, k_nn)
g.add_edges(edges['res1'],
edges['res2'],
data={'k_nn_dist': torch.Tensor(list(edges['distance']))})
if not (contact_file and 'contacts' in edge_construction):
self._compute_protein_contacts(pdb_code)
if 'contacts' in edge_construction:
edges = self._get_protein_edges(pdb_code, chain_selection, contact_file=None)
g = self._add_protein_edges_to_graph(g, edges)
if self.edge_distance_cutoff and 'distance' in edge_construction:
# Get distance-based edges
edges = self._distance_based_edges(df, self.edge_distance_cutoff)
# Add edges
g.add_edges(list(edges['level_0']),
list(edges['level_1']),
data={'dist': torch.Tensor(list(edges[0]))})
if 'delaunay' in edge_construction:
edges = self._get_delaunay_edges(df, furthest_site=False, incremental=False)
g.add_edges(list(edges['res1']),
list(edges['res2']),
data={'delaunay_euclidean_distance': torch.Tensor(list(edges['distance']))})
# Add user supplied edges
if custom_edges:
g.add_edges(list(custom_edges['res1']),
list(custom_edges['res2']),
data={'user_edge_data': torch.Tensor(list(custom_edges['data']))})
if self.include_ss:
dssp = self._get_protein_features(pdb_code, file_path=None, chain_selection=chain_selection)
feats = self._compute_protein_feature_representations(dssp)
g = self._add_protein_features(g, feats)
# Label Encoding of Node IDs
if encoding:
resiude_name_encoder = preprocessing.LabelEncoder()
residue_id_encoder = preprocessing.LabelEncoder()
residue_names = g.ndata['residue_name']
residue_id = g.ndata['id']
g.ndata['residue_name'] = resiude_name_encoder.fit_transform(residue_names)
g.ndata['id'] = residue_id_encoder.fit_transform(residue_id)
return g, resiude_name_encoder, residue_id_encoder
if self.remove_string_labels:
g.ndata.pop('residue_name')
g.ndata.pop('id')
if self.verbose:
print(g)
return g
[docs] def dgl_graph_from_pdb_file(self, file_path, chain_selection, contact_file, edges=None):
"""
Produces a DGL graph from a PDB file and a selection of polypeptide chains
:param edges: User-defined custom edges, defaults to None
:type edges: Pandas DataFram, optional
:param contact_file: Path to local GetContacts output file
:type contact_file: str
:param file_path: 4 character PDB accession code
:type file_path: str
:param chain_selection: Polypeptide chains in structure to select {'A', 'B', 'AB', ..., 'all}
:type chain_selection: str
:return: DGLGraph object, nodes populated by residues or atoms as specified in class initialisation
:rtype: DGLGraph
"""
# Atom-level Graph
if self.granularity == 'atom':
g = self._make_atom_graph(file_path)
# Residue-level graph
if self.granularity == 'CA' or 'CB':
# Pre-process protein Df
df = self._protein_df(pdb_path=file_path)
chains = self._get_chains(df, chain_selection)
df = pd.concat(chains)
# Create Graph
g = self._add_protein_nodes(df)
# Add edges
if not contact_file:
self._compute_protein_contacts(file_path)
if not edges:
edges = self._get_protein_edges(file_path, chain_selection, contact_file)
g = self._add_protein_edges_to_graph(g, edges)
if self.include_ss:
dssp = self._get_protein_features(file_path=file_path, pdb_code=None)
feats = self._compute_protein_feature_representations(dssp)
g = self._add_protein_features(g, feats)
return g
[docs] def nx_graph_from_pdb_code(self, pdb_code, chain_selection='all', contact_file=None, edge_construction=['contacts'],
encoding=False, k_nn=None, custom_edges=None):
"""
Produces a NetworkX Graph Object
:param encoding:
:type bool:
:param edges: User-supplied edges, defaults to None
:type edges: Pandas DataFrame, optional
:param pdb_code: 4 character PDB accession code
:type pdb_code: str
:param chain_selection: string indicating chain selection {'A', 'B', 'AB', ..., 'all'}, defaults to 'all'
:type chain_selection: str
:param contact_file: Path to GetContacts output file.
:type contact_file: str, optional
:return: NetworkX graph object of protein
:rtype: NetworkX graph
"""
assert encoding, 'Non-numeric feature encoding must be True'
g, resiude_name_encoder, residue_id_encoder = self.dgl_graph_from_pdb_code(pdb_code=pdb_code,
chain_selection=chain_selection,
contact_file=contact_file,
edge_construction=edge_construction,
custom_edges=custom_edges,
encoding=encoding,
k_nn=k_nn
)
node_attrs = g.node_attr_schemes().keys()
edge_attrs = g.edge_attr_schemes().keys()
return dgl.to_networkx(g, node_attrs, edge_attrs), resiude_name_encoder, residue_id_encoder
[docs] def nx_graph_from_pdb_file(self, pdb_code, chain_selection='all', contact_file=None):
"""
Produces a NetworkX Graph Object
:param pdb_code: 4 character PDB accession code
:type pdb_code: str
:param chain_selection: string indicating chain selection {'A', 'B', 'AB', ..., 'all'}
:type chain_selection: str
:param contact_file: Path to GetContacts output file.
:type contact_file: str, optional
:return: NetworkX graph object of protein
"""
g, resiude_name_encoder, residue_id_encoder = self.dgl_graph_from_pdb_file(pdb_code, chain_selection,
contact_file)
node_attrs = g.node_attr_schemes().keys()
edge_attrs = g.edge_attr_schemes().keys()
return dgl.to_networkx(g, node_attrs, edge_attrs), resiude_name_encoder, residue_id_encoder
[docs] def torch_geometric_graph_from_pdb_code(self, pdb_code, chain_selection='all', edge_construction=['contacts'],
contact_file=None, encoding=False, k_nn=None, custom_edges=None):
"""
Produces a PyToch Geometric Data object from a protein structure
:param k_nn: Specifies K nearest neighbours to use in KNN edge construction, defaults to None
:type k_nn: int, optional
:param custom_edges: User-supplied edges to use, defaults to None
:type custom_edges: Pandas DataFrame, optional
:param encoding:
:type encoding: bool
:param edge_construction: List containing edge construction to be used. ['contacts', 'distance', 'delaunay'], defaults to ['contacts']
:type edge_construction: list
:param pdb_code: 4-character PDB accession code
:type pdb_code: str
:param chain_selection: Specifies polypeptide chains to include. e.g. one of {'A', 'B' ,'AB', 'BC'}, defaults to 'all'
:type chain_selection: str
:param contact_file: Path to contact file if using local file.
:type contact_file: str
:return: Pytorch Geometric Graph of protein structure.
:rtype: PyTorch Geometric Data object
"""
assert encoding, 'Non-numeric feature encoding must be True'
g, resiude_name_encoder, residue_id_encoder = self.dgl_graph_from_pdb_code(pdb_code=pdb_code,
chain_selection=chain_selection,
contact_file=contact_file,
edge_construction=edge_construction,
custom_edges=custom_edges,
encoding=encoding,
k_nn=k_nn)
# Get node features from DGL graph and concatenate them
node_feature_names = g.node_attr_schemes().keys()
dgl_graph_features = [g.ndata[feat].float() for feat in node_feature_names]
dgl_graph_features = [f.unsqueeze(dim=1) if len(f.shape) == 1 else f for f in dgl_graph_features]
node_features = torch.cat(dgl_graph_features, dim=1)
# Get edge features from DGL graph and concatenate them
edge_types = g.edge_attr_schemes().keys()
edge_feats = [g.edata[e].float() for e in edge_types]
edge_feats = [e.unsqueeze(dim=1) if len(e.shape) == 1 else e for e in edge_feats]
edge_feats = torch.cat(edge_feats, dim=1)
# Create the Torch Geometric graph
geom_graph = (Data(x=node_features,
edge_index=torch.stack(g.edges(), dim=1),
edge_attr=edge_feats
))
print(geom_graph)
return geom_graph
def _make_atom_graph(self, pdb_code=None, pdb_path=None, node_featurizer=None, edge_featurizer=None,
graph_type='bigraph'):
"""
Create atom-level graph from PDB structure
:param graph_type:
:param pdb_code:
:param pdb_path:
:param node_featurizer:
:param edge_featurizer:
:return:
"""
if node_featurizer is None:
node_featurizer = CanonicalAtomFeaturizer()
if edge_featurizer is None:
edge_featurizer = CanonicalBondFeaturizer()
# Read in protein as mol
# if pdb_path:
if pdb_code:
pdb_path = self.pdb_dir + pdb_code + '.pdb'
if not os.path.isfile(pdb_path):
self._download_pdb(pdb_code)
assert os.path.isfile(pdb_path)
mol = MolFromPDBFile(pdb_path)
# DGL mol to graph
if graph_type == 'bigraph':
g = mol_to_bigraph(mol,
node_featurizer=node_featurizer,
edge_featurizer=edge_featurizer
)
elif graph_type == 'complete':
g = mol_to_complete_graph(mol,
node_featurizer=node_featurizer,
)
elif graph_type == 'k_nn':
raise NotImplementedError
print(g)
return g
def _protein_df(self, pdb_path):
"""
Pre-processes protein structure dataframe.
:param pdb_path:
:param pdb_code - 4 letter PDB accession code
:return: 'cleaned protein dataframe'
"""
protein_df = PandasPdb().read_pdb(pdb_path)
atoms = protein_df.df['ATOM']
hetatms = protein_df.df['HETATM']
if self.granularity == 'centroids':
if self.deprotonate:
atoms = atoms.loc[atoms['atom_name'] != 'H'].reset_index()
centroids = self._calculate_centroid_positions(atoms)
atoms = atoms.loc[atoms['atom_name'] == 'CA'].reset_index()
atoms['x_coord'] = centroids['x_coord']
atoms['y_coord'] = centroids['y_coord']
atoms['z_coord'] = centroids['z_coord']
else:
atoms = atoms.loc[atoms['atom_name'] == self.granularity]
if self.keep_hets:
if self.exclude_waters:
hetatms = hetatms.loc[hetatms['residue_name'] != 'HOH']
if self.verbose:
print(f'Detected {len(hetatms)} HETATOM nodes')
protein_df = pd.concat([atoms, hetatms])
else:
protein_df = atoms
# Remove alt_loc resdiues
protein_df = protein_df.loc[protein_df['alt_loc'].isin(['', 'A'])]
if self.verbose:
print(f'Detected {len(protein_df)} total nodes')
return protein_df
def _calculate_centroid_positions(self, atoms):
"""
Calculates position of sidechain centroids
:param atoms: ATOM df of protein structure
:return: centroids (df)
"""
centroids = atoms.groupby('residue_number').mean()[['x_coord', 'y_coord', 'z_coord']].reset_index()
if self.verbose:
print(f'Calculated {len(centroids)} centroid nodes')
return centroids
@staticmethod
def _get_chains(protein_df, chain_selection):
"""
Extracts relevant chains from protein_df
:param protein_df: pandas dataframe of PDB subsetted to relevant atoms (CA, CB)
:param chain_selection:
:return
"""
if chain_selection != 'all':
chains = [protein_df.loc[protein_df['chain_id'] == chain] for chain in chain_selection]
else:
chains = [protein_df.loc[protein_df['chain_id'] == chain] for chain in protein_df['chain_id'].unique()]
return chains
def _add_protein_nodes(self, chain):
"""
Add protein nodes to graph from list of PandasPDB dataframes for each chain
:param chain: (list of dataframes) Contains a dataframe for each chain in the protein
:return: g (DGLGraph): Graph of protein only populated by the nodes
"""
g = dgl.DGLGraph()
nodes = chain['chain_id'] + ':' + chain['residue_name'] + ':' + chain['residue_number'].apply(str)
if self.granularity == 'atom':
nodes = nodes + ':' + chain['atom_name']
node_features = [self._aa_features(residue, self.node_featuriser) for residue in chain['residue_name']]
coords = torch.Tensor(np.asarray(chain[['x_coord', 'y_coord', 'z_coord']])).type('torch.FloatTensor')
g.add_nodes(len(nodes),
{'id': nodes,
'residue_name': chain['residue_name'],
'h': torch.stack(node_features).type('torch.FloatTensor'),
'coords': coords
})
return g
def _aa_features(self, residue, embedding):
"""
Retrieves amino acid embeddings
:param residue: str specifying the amino acid
:param embedding: embedding to use {'meiler', 'kidera'}
:return: features: torch tensor of features
"""
if residue not in self.embedding_dict[embedding].keys():
residue = 'UNKNOWN'
features = torch.Tensor(self.embedding_dict[embedding][residue]).double()
return features
def _download_pdb(self, pdb_code):
"""
Download PDB structure from PDB
:param pdb_code: 4 character PDB accession code
:return: # todo impl return
"""
# Initialise class and download pdb file
pdbl = PDBList()
pdbl.retrieve_pdb_file(pdb_code, pdir=self.pdb_dir, overwrite=True, file_format='pdb')
# Rename file to .pdb from .ent
os.rename(self.pdb_dir + "pdb" + pdb_code + '.ent', self.pdb_dir + pdb_code + '.pdb')
# Assert file has been downloaded
assert any(pdb_code in s for s in os.listdir(self.pdb_dir))
print(f'Downloaded PDB file for: {pdb_code}')
def _compute_protein_contacts(self, pdb_code, file_name=False):
"""Computes contacts from .pdb file using GetContacts - https://www.github.com/getcontacts/getcontacs
:param: pdb_code - 4 character PDB accession code
"""
# Check for existence of contacts file
if file_name:
contacts_file = glob.glob(self.contacts_dir + file_name)
else:
contacts_file = glob.glob(self.contacts_dir + "*" + pdb_code + "*.tsv")
if contacts_file:
print(f'Contact file found: {contacts_file}')
return
print(pdb_code)
# Check for existence of pdb file
pdb_file = glob.glob(self.pdb_dir + "*" + pdb_code + "*.pdb")
print(pdb_file)
if not pdb_file:
# Download PDB file
print('PDB file not downloaded')
# self.download_pdb(pdb_code)
pdb_file = self.pdb_dir + pdb_code + ".pdb"
else:
pdb_file = pdb_file[0]
print(f'PDB file detected: {pdb_file}')
# Run GetContacts
command = f'{self.get_contacts_path}/get_static_contacts.py '
command += f'--structure {pdb_file} '
command += f'--output {self.contacts_dir + pdb_code + "_contacts.tsv"} '
command += '--itypes all' # --sele "protein"'
subprocess.run(command, shell=True)
assert os.path.isfile(self.contacts_dir + pdb_code + "_contacts.tsv")
print(f'Computed Contacts for: {pdb_code}')
def _get_protein_edges(self, pdb_code, chain_selection, contact_file):
"""
Compute protein edges
:param contact_file:
:param chain_selection:
:param pdb_code: 4 character pdb accession code
:return: edges : dataframe containing edges derived from GetContacts analysis
# todo impl covalent bond structure
"""
if not contact_file:
contact_file = self.contacts_dir + pdb_code + '_contacts' + '.tsv'
edges = set()
# Read Contacts File
with open(contact_file, 'r') as f:
next(f)
next(f)
for line in f:
linfo = line.strip().split('\t')
interaction_type = linfo[1]
# Select interacting Residues
if self.granularity == 'CA' or 'CB' or 'atom':
res1 = linfo[2]
res2 = linfo[3]
if self.granularity != 'atom':
res1 = re.search(r'.\:(.*?)\:(.*?)(?=:)', res1)[0]
res2 = re.search(r'.\:(.*?)\:(.*?)(?=:)', res2)[0]
# Add edge to set of edges
edges.add((res1, res2, interaction_type))
edges = pd.DataFrame(list(edges), columns=['res1', 'res2', 'interaction_type'])
# Remove all unallowed interactions
edges = edges.loc[edges['interaction_type'].isin(self.INTERACTION_TYPES)]
if chain_selection != 'all':
edges = edges.loc[edges['res1'].str.startswith(tuple(chain_selection))]
edges = edges.loc[edges['res2'].str.startswith(tuple(chain_selection))]
# Filter out interactions for disordered/unassigned residues
edges = edges.loc[~edges['res1'].str.contains("[A-Z]$")]
edges = edges.loc[~edges['res2'].str.contains("[A-Z]$")]
edges = edges.loc[~edges['res1'].str.contains(":0$")]
edges = edges.loc[~edges['res2'].str.contains(":0$")]
edges = edges.loc[~edges['res1'].str.contains('^X:')]
edges = edges.loc[~edges['res2'].str.contains('^X:')]
if self.long_interaction_threshold:
res1 = edges['res1'].str.extract('(\d+)').astype(int)
res2 = edges['res2'].str.extract('(\d+)').astype(int)
inds = abs(res1 - res2) > self.long_interaction_threshold
edges = edges[inds]
if self.verbose:
print(f'Calculated {len(edges)} intramolecular interaction-based edges')
return edges
def _add_protein_edges_to_graph(self, g, e):
"""
Add protein edges from dataframe of edges
:param g: Dgl graph of protein
:param e: Pandas dataframe of edges
:return: g DGL Graph with edges added
"""
if self.granularity == 'dense':
g.add_edges(
[i for i in range(g.number_of_nodes()) for j in range(g.number_of_nodes() - 1)], [
j for i in range(g.number_of_nodes())
for j in range(g.number_of_nodes()) if i != j
])
return g
else:
index = dict(zip(list(g.ndata['id']),
list(range(len(g.ndata['id'])))
))
# Remove interactions for edges between nodes not in graph. E.g hetatms
e = e.loc[e['res1'].isin(index.keys())]
e = e.loc[e['res2'].isin(index.keys())]
res1_ind = [index[res] for res in e['res1']]
res2_ind = [index[res] for res in e['res2']]
interactions = [self._onek_encoding_unk(interaction, self.INTERACTION_TYPES) for interaction in
e['interaction_type']]
g.add_edges(res1_ind, res2_ind, {'rel_type': torch.Tensor(interactions).double(),
'norm': torch.ones(len(interactions))})
return g
@staticmethod
def _onek_encoding_unk(x, allowable_set):
"""
Function for one hot encoding
:param x: value to one-hot
:param allowable_set: set of options to encode
:return: one-hot encoding as torch tensor
"""
# if x not in allowable_set:
# x = allowable_set[-1]
return [x == s for s in allowable_set]
def _get_protein_features(self, pdb_code, file_path, chain_selection):
"""
:param file_path: (str) file path to PDB file
:param pdb_code: (str) String containing four letter PDB accession
:return df (pd.DataFrame): Dataframe containing output of DSSP (Solvent accessibility, secondary structure for each residue)
"""
# Run DSSP on relevant PDB file
if pdb_code:
d = dssp_dict_from_pdb_file(self.pdb_dir + pdb_code + '.pdb')
if file_path:
d = dssp_dict_from_pdb_file(file_path)
# Parse DSSP output to DataFrame
appender = []
for k in d[1]:
to_append = []
y = d[0][k]
chain = k[0]
residue = k[1]
het = residue[0]
resnum = residue[1]
icode = residue[2]
to_append.extend([chain, resnum, icode])
to_append.extend(y)
appender.append(to_append)
cols = ['chain', 'resnum', 'icode', 'aa', 'ss', 'exposure_rsa',
'phi', 'psi', 'dssp_index', 'NH_O_1_relidx', 'NH_O_1_energy',
'O_NH_1_relidx', 'O_NH_1_energy', 'NH_O_2_relidx', 'NH_O_2_energy',
'O_NH_2_relidx', 'O_NH_2_energy']
df = pd.DataFrame.from_records(appender, columns=cols)
# Subset dataframe to those in chain_selection
if chain_selection != 'all':
df = df.loc[df['chain'].isin(chain_selection)]
# Rename cysteines to 'C'
df['aa'] = df['aa'].str.replace('[a-z]', 'C')
df = df[df['aa'].isin(list(aa1))]
# Drop alt_loc residues
df = df.loc[df['icode'] == ' ']
# Add additional Columns
df['aa_three'] = df['aa'].apply(one_to_three)
df['max_acc'] = df['aa_three'].map(residue_max_acc['Sander'].get)
df[['exposure_rsa', 'max_acc']] = df[['exposure_rsa', 'max_acc']].astype(float)
df['exposure_asa'] = df['exposure_rsa'] * df['max_acc']
df['index'] = df['chain'] + ':' + df['aa_three'] + ':' + df['resnum'].apply(str)
return df
def _compute_protein_feature_representations(self, dssp_df):
"""
:param dssp_df: (pd.DataFrame): Df containing parsed output of DSSP
:return feature_dict (dict): Dictionary of tensorized features
"""
# One hot encoded secondary structure assignments
ss_set = ['G', 'H', 'I', 'E', 'B', 'T', 'S', 'C', '-']
ss = [self._onek_encoding_unk(ss, ss_set) for ss in dssp_df['ss']]
# Create feature dictionary
feature_dict = {'ss': torch.Tensor(ss),
'asa': torch.Tensor(np.asarray(dssp_df['exposure_asa'])).reshape(len(dssp_df), 1),
'rsa': torch.Tensor(np.asarray(dssp_df['exposure_rsa'])).reshape(len(dssp_df), 1)
}
return feature_dict
@staticmethod
def _add_protein_features(g, feature_dict):
"""
Add computed protein features to graph
:param g: DGL Graph of protein.
:param feature_dict: Dictionary of features calculated by DSSP
:return: g DGL Graph of protein with SS and solvent accessibility features added to node data
"""
# 0 Pad Tensors for Proteins with HETATMS that DSSP Can't Deal with
pad_length = len(g.ndata['h']) - len(feature_dict['ss'])
if pad_length > 0:
pad = [0, 0, 0, pad_length]
feature_dict['ss'] = F.pad(feature_dict['ss'], pad, 'constant', 0)
feature_dict['asa'] = F.pad(feature_dict['asa'], pad, 'constant', 0)
feature_dict['rsa'] = F.pad(feature_dict['rsa'], pad, 'constant', 0)
# Assign Features
g.ndata['ss'] = feature_dict['ss']
g.ndata['asa'] = feature_dict['asa']
g.ndata['rsa'] = feature_dict['rsa']
return g
def _k_nn_edges(self, protein_df, k, mode='connectivity', metric='minkowski', p=2, include_self=False):
"""
Construct edges based on K nearest neighbours
:param protein_df: PandasPDB DF of protein structure
:param k: number of nearest neighbour edges for each node
:param mode: {'connectivity', 'distance'}
:param metric: {'minkowskii}
:param p:
:param include_self: bool - whether or not to include self-loops
:return:
"""
# Create distance matrix
coords = protein_df[['x_coord', 'y_coord', 'z_coord']]
# dists = pairwise_distances(np.asarray(coords))
dists = np.asarray(coords)
# Perform K-NN on coordinates
nn = kneighbors_graph(X=dists,
n_neighbors=k,
mode=mode,
metric=metric,
p=p,
include_self=include_self
)
# Create dataframe of edges
outgoing = np.repeat(np.array(range(len(coords))), k)
incoming = nn.indices
edge_df = pd.DataFrame({'res1': outgoing,
'res2': incoming,
'distance': nn.data
})
if self.long_interaction_threshold:
edge_df = edge_df.loc[abs(abs(edge_df['res1']) - abs(edge_df['res2'])) > self.long_interaction_threshold]
if self.verbose:
print(f'Calculated {len(edge_df)} K-nearest neighbour edges')
return edge_df
def _distance_based_edges(self, protein_df, cutoff):
"""
Calculate distance-based edges from coordinates in 3D structure.
Produce Edge list dataframe based on pairwise distance matrix calculation
:param protein_df: PandasPDB Dataframe
:param cutoff: Distance threshold to create an edge (Angstroms)
:return: dists : pandas dataframe of edge list and distance
"""
# Create distance matrix
coords = protein_df[['x_coord', 'y_coord', 'z_coord']]
dists = pairwise_distances(np.asarray(coords))
# Filter distance matrix and select lower triangle
dists = pd.DataFrame(np.tril(np.where(dists < cutoff, dists, 0)))
# Reshape to produce edge list
dists.values[[np.arange(len(dists))] * 2] = np.nan
dists = dists.stack().reset_index()
# Filter to remove edges that exceed cutoff
dists = dists.loc[dists[0] != 0]
if self.long_interaction_threshold:
dists = dists.loc[abs(abs(dists['level_0']) - abs(dists['level_1'])) > self.long_interaction_threshold]
if self.verbose:
print(f'Calcuclated {len(dists)} distance-based edges')
return dists
""""
@staticmethod
def get_voronoi_edges(protein_df, furthest_site=False, incremental=False):
#Calculate Voronoi edges from protein dataframe
#:param protein_df:
#:param furthest_site:
##:param incremental:
#:return:
coord = protein_df[['x_coord', 'y_coord', 'z_coord']]
vor = spatial.Voronoi(points=coord, furthest_site=furthest_site, incremental=incremental)
edges = pd.DataFrame(vor.ridge_points)
print(edges)
edges.columns = ['res1', 'res2']
print(f'Calculated {len(edges)} voronoi-ridge edges')
return edges
"""
def _get_delaunay_edges(self, protein_df, furthest_site=False, incremental=False):
"""
Calculate Delaunay edges from a dataframe of coordinates
:param protein_df:
:param furthest_site:
:param incremental:
:return:
"""
coord = protein_df[['x_coord', 'y_coord', 'z_coord']]
delaunay = spatial.Delaunay(coord, furthest_site=furthest_site, incremental=incremental)
# Turn simplices into edgelist
edges = []
indices, indptr = delaunay.vertex_neighbor_vertices
for i in range(indices.shape[0] - 1):
for j in indptr[indices[i]:indices[i + 1]]:
try:
edges.append([i, j])
except IndexError:
pass
# Create edge DataFrame
edge_df = pd.DataFrame(edges)
edge_df.columns = ['res1', 'res2']
# Get distances between edges
distances = []
for row in range(len(edge_df)):
a = coord.iloc[edge_df.iloc[row]['res1']]
b = coord.iloc[edge_df.iloc[row]['res2']]
distances.append(spatial.distance.euclidean(a, b))
edge_df['distance'] = distances
if self.long_interaction_threshold:
edge_df = edge_df.loc[abs(abs(edge_df['res1']) - abs(edge_df['res2'])) > self.long_interaction_threshold]
if self.verbose:
print(f'Calculated {len(edge_df)} Delaunay edges')
return edge_df
if __name__ == "__main__":
"""
pg = ProteinGraph(granularity='CA', insertions=False, keep_hets=True,
node_featuriser='meiler',
allowed_interactions=None,
get_contacts_path='/home/arj39/Documents/github/getcontacts',
pdb_dir='/home/arj39/Documents/test/pdb/', contacts_dir='/home/arj39/Documents/test/contacts/',
exclude_waters=True, covalent_bonds=False, include_ss=True, include_ligand=False,
edge_distance_cutoff=5
# node_featuriser=dgl.data.chem.atom_type_one_hot(),
# edge_featuriser=dgl.data.chem.bond_type_one_hot(),
# graph_constructor=dgl.data.chem.mol_to_graph())
)
"""
"""
pg = ProteinGraph(granularity='CA', insertions=False, keep_hets=True,
node_featuriser='meiler',
allowed_interactions=None,
get_contacts_path='/home/arj39/Documents/github/getcontacts',
pdb_dir='/home/arj39/Documents/test/pdb/', contacts_dir='/home/arj39/Documents/test/contacts/',
exclude_waters=True, covalent_bonds=False, include_ss=True, include_ligand=False,
edge_distance_cutoff=10
# node_featuriser=dgl.data.chem.atom_type_one_hot(),
# edge_featuriser=dgl.data.chem.bond_type_one_hot(),
# graph_constructor=dgl.data.chem.mol_to_graph())
)
"""
pg = ProteinGraph(granularity='CA', insertions=False, keep_hets=False,
node_featuriser='meiler',
intramolecular_interactions=None,
get_contacts_path='/Users/arianjamasb/github/getcontacts',
pdb_dir='../examples/pdbs/',
contacts_dir='../examples/contacts/',
exclude_waters=True,
covalent_bonds=False,
include_ss=True,
include_ligand=False,
verbose=True,
long_interaction_threshold=5,
edge_distance_cutoff=10
)
"""
g = pg.dgl_graph_from_pdb_code('3eiy',
chain_selection='all',
edge_construction=['distance', 'delaunay'], # , 'delaunay', 'k_nn'],
encoding=False,
k_nn=None)
pg.torch_geometric_graph_from_pdb_code('3eiy',
chain_selection='all',
edge_construction=['distance', 'delaunay'],
encoding=True,
k_nn=None)
"""
g, _, __ = pg.nx_graph_from_pdb_code(pdb_code='3eiy',
chain_selection='all',
edge_construction=['contacts'],
encoding=True)
#pg.make_atom_graph(pdb_code='3eiy')
# Check KNN
# g, resiude_name_encoder, residue_id_encoder = pg.nx_graph_from_pdb_code('3eiy', chain_selection='all',
# edge_construction=['distance', 'contacts'],
# encoding=True)