import numpy as np
import pandas as pd
import copy
# Отображение структур
import IPython.display
import ipywidgets
from IPython.display import display,display_svg,SVG,Image
# Open Drug Discovery Toolkit
import oddt
import oddt.docking
import oddt.interactions
# Органика
from rdkit.Chem import Draw
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
Подготовка белка
import mdtraj as md
u = md.load('plus_ligand.B99990001.pdb')
pdb = u.topology
for i,r in enumerate(pdb.atoms):
print(i,r)
#посмотрим на атомы
# сохраним pdb без лиганда
u2 = u.atom_slice(range(1062))
u2.save_pdb('1lmp-clean.pdb', force_overwrite=True)
#проверим точно ли больше нет лиганда
m = md.load('1lmp-clean.pdb')
pdb_n = m.topology
for it,ra in enumerate(pdb_n.atoms):
print(it,ra)
#посмотрим на атомы - да, остался только белок
#нахождение геометрического центра лиганда
lig = u.atom_slice(range(1062,1104)) #список номера атомов лиганда
for v in lig.xyz:
lig_center = np.mean(v, axis=0)* 10
lig_center
print(lig_center)
lig.save_pdb('lig.pdb', force_overwrite=True)
Подготовка белка для докинга
#from openbabel import pybel
import pybel
prot = next(oddt.toolkit.readfile('pdb','1lmp-clean.pdb'))
prot.protein = True
#prot.addh(only_polar=True)
#prot.calccharges()
#prot.OBMol.AddPolarHydrogens()
#prot.OBMol.AutomaticPartialCharge()
print('is it the first mol in 1lmp is protein?', prot.protein,':) and MW of this mol is:', prot.molwt)
Лиганды для докинга
smiles = ["CC(=O)NC1C(O)OC(CO)C(O)C1O", # Common NAG
"OC(=O)NC1C(O)OC(CO)C(O)C1O", # -OH
"[NH3+]C(=O)NC1C(O)OC(CO)C(O)C1O", # -NH3+
"C(=O)NC1C(O)OC(CO)C(O)C1O", # -H
"C=1(C=CC=CC1)C(=O)NC1C(O)OC(CO)C(O)C1O", # -Ph
"C(C([O-])=O)(=O)NC1C(O)OC(CO)C(O)C1O"] # -COO-
mols= []
images =[]
for s in smiles:
m = oddt.toolkit.readstring('smi', s)
if not m.OBMol.Has3D():
m.make3D(forcefield='mmff94', steps=150)
### Опять всё сделают за Вас
###m.removeh()
###m.OBMol.AddPolarHydrogens()
#Chem.rdMolDescriptors.CalcMolFormula(m.Mol)
print(s, "ok")
mols.append(m)
###with print m.OBMol.Has3D() was found that:
### deep copy needed to keep 3D , write svg make mols flat
images.append((SVG(copy.deepcopy(m).write('svg'))))
display_svg(*images)
# кусок из пакета, чтобы не танцевать с бубном
import sys
import subprocess
import re
import os
import warnings
from tempfile import mkdtemp
from shutil import rmtree
from six import string_types
import oddt
from oddt.utils import (is_openbabel_molecule,
is_molecule,
check_molecule)
from oddt.spatial import rmsd
class autodock_vina(object):
def __init__(self,
protein=None,
auto_ligand=None,
size=(20, 20, 20),
center=(0, 0, 0),
exhaustiveness=8,
num_modes=9,
energy_range=3,
seed=None,
prefix_dir='/tmp',
n_cpu=1,
executable=None,
autocleanup=True,
skip_bad_mols=True):
"""Autodock Vina docking engine, which extends it's capabilities:
automatic box (auto-centering on ligand).
Parameters
----------
protein: oddt.toolkit.Molecule object (default=None)
Protein object to be used while generating descriptors.
auto_ligand: oddt.toolkit.Molecule object or string (default=None)
Ligand use to center the docking box. Either ODDT molecule or
a file (opened based on extesion and read to ODDT molecule).
Box is centered on geometric center of molecule.
size: tuple, shape=[3] (default=(20, 20, 20))
Dimentions of docking box (in Angstroms)
center: tuple, shape=[3] (default=(0,0,0))
The center of docking box in cartesian space.
exhaustiveness: int (default=8)
Exhaustiveness parameter of Autodock Vina
num_modes: int (default=9)
Number of conformations generated by Autodock Vina. The maximum
number of docked poses is 9 (due to Autodock Vina limitation).
energy_range: int (default=3)
Energy range cutoff for Autodock Vina
seed: int or None (default=None)
Random seed for Autodock Vina
prefix_dir: string (default=/tmp)
Temporary directory for Autodock Vina files
executable: string or None (default=None)
Autodock Vina executable location in the system.
It's realy necessary if autodetection fails.
autocleanup: bool (default=True)
Should the docking engine clean up after execution?
skip_bad_mols: bool (default=True)
Should molecules that crash Autodock Vina be skipped.
"""
self.dir = prefix_dir
self._tmp_dir = None
# define binding site
self.size = size
self.center = center
# center automaticaly on ligand
if auto_ligand:
if isinstance(auto_ligand, string_types):
extension = auto_ligand.split('.')[-1]
auto_ligand = next(oddt.toolkit.readfile(extension, auto_ligand))
self.center = auto_ligand.coords.mean(axis=0).round(3)
# autodetect Vina executable
if not executable:
try:
self.executable = (subprocess.check_output(['which', 'vina'])
.decode('ascii').split('\n')[0])
except subprocess.CalledProcessError:
raise Exception('Could not find Autodock Vina binary.'
'You have to install it globaly or supply binary'
'full directory via `executable` parameter.')
else:
self.executable = executable
# detect version
self.version = (subprocess.check_output([self.executable, '--version'])
.decode('ascii').split(' ')[2])
self.autocleanup = autocleanup
self.cleanup_dirs = set()
# share protein to class
self.protein = None
self.protein_file = None
if protein:
self.set_protein(protein)
self.skip_bad_mols = skip_bad_mols
self.n_cpu = n_cpu
if self.n_cpu > exhaustiveness:
warnings.warn('Exhaustiveness is lower than n_cpus, thus CPU will '
'not be saturated.')
# pregenerate common Vina parameters
self.params = []
self.params += ['--center_x', str(self.center[0]),
'--center_y', str(self.center[1]),
'--center_z', str(self.center[2])]
self.params += ['--size_x', str(self.size[0]),
'--size_y', str(self.size[1]),
'--size_z', str(self.size[2])]
self.params += ['--exhaustiveness', str(exhaustiveness)]
if seed is not None:
self.params += ['--seed', str(seed)]
if num_modes > 9 or num_modes < 1:
raise ValueError('The number of docked poses must be between 1 and 9'
' (due to Autodock Vina limitation).')
self.params += ['--num_modes', str(num_modes)]
self.params += ['--energy_range', str(energy_range)]
@property
def tmp_dir(self):
if not self._tmp_dir:
self._tmp_dir = mkdtemp(dir=self.dir, prefix='autodock_vina_')
self.cleanup_dirs.add(self._tmp_dir)
return self._tmp_dir
@tmp_dir.setter
def tmp_dir(self, value):
self._tmp_dir = value
def set_protein(self, protein):
"""Change protein to dock to.
Parameters
----------
protein: oddt.toolkit.Molecule object
Protein object to be used.
"""
# generate new directory
self._tmp_dir = None
if protein:
if isinstance(protein, string_types):
extension = protein.split('.')[-1]
if extension == 'pdbqt':
self.protein_file = protein
self.protein = next(oddt.toolkit.readfile(extension, protein))
self.protein.protein = True
else:
self.protein = next(oddt.toolkit.readfile(extension, protein))
self.protein.protein = True
else:
self.protein = protein
# skip writing if we have PDBQT protein
if self.protein_file is None:
self.protein_file = write_vina_pdbqt(self.protein, self.tmp_dir,
flexible=False)
def score(self, ligands, protein=None):
"""Automated scoring procedure.
Parameters
----------
ligands: iterable of oddt.toolkit.Molecule objects
Ligands to score
protein: oddt.toolkit.Molecule object or None
Protein object to be used. If None, then the default
one is used, else the protein is new default.
Returns
-------
ligands : array of oddt.toolkit.Molecule objects
Array of ligands (scores are stored in mol.data method)
"""
if protein:
self.set_protein(protein)
if not self.protein_file:
raise IOError("No receptor.")
if is_molecule(ligands):
ligands = [ligands]
ligand_dir = mkdtemp(dir=self.tmp_dir, prefix='ligands_')
output_array = []
for n, ligand in enumerate(ligands):
check_molecule(ligand, force_coords=True)
ligand_file = write_vina_pdbqt(ligand, ligand_dir, name_id=n)
try:
scores = parse_vina_scoring_output(
subprocess.check_output([self.executable, '--score_only',
'--receptor', self.protein_file,
'--ligand', ligand_file] + self.params,
stderr=subprocess.STDOUT))
except subprocess.CalledProcessError as e:
sys.stderr.write(e.output.decode('ascii'))
if self.skip_bad_mols:
continue
else:
raise Exception('Autodock Vina failed. Command: "%s"' %
' '.join(e.cmd))
ligand.data.update(scores)
output_array.append(ligand)
rmtree(ligand_dir)
return output_array
def dock(self, ligands, protein=None):
"""Automated docking procedure.
Parameters
----------
ligands: iterable of oddt.toolkit.Molecule objects
Ligands to dock
protein: oddt.toolkit.Molecule object or None
Protein object to be used. If None, then the default one
is used, else the protein is new default.
Returns
-------
ligands : array of oddt.toolkit.Molecule objects
Array of ligands (scores are stored in mol.data method)
"""
if protein:
self.set_protein(protein)
if not self.protein_file:
raise IOError("No receptor.")
if is_molecule(ligands):
ligands = [ligands]
ligand_dir = mkdtemp(dir=self.tmp_dir, prefix='ligands_')
output_array = []
for n, ligand in enumerate(ligands):
check_molecule(ligand, force_coords=True)
ligand_file = write_vina_pdbqt(ligand, ligand_dir, name_id=n)
ligand_outfile = ligand_file[:-6] + '_out.pdbqt'
try:
scores = parse_vina_docking_output(
subprocess.check_output([self.executable, '--receptor',
self.protein_file,
'--ligand', ligand_file,
'--out', ligand_outfile] +
self.params +
['--cpu', str(self.n_cpu)],
stderr=subprocess.STDOUT))
except subprocess.CalledProcessError as e:
sys.stderr.write(e.output.decode('ascii'))
if self.skip_bad_mols:
continue # TODO: print some warning message
else:
raise Exception('Autodock Vina failed. Command: "%s"' %
' '.join(e.cmd))
# docked conformations may have wrong connectivity - use source ligand
if is_openbabel_molecule(ligand):
if oddt.toolkits.ob.__version__ >= '2.4.0':
# find the order of PDBQT atoms assigned by OpenBabel
with open(ligand_file) as f:
write_order = [int(line[7:12].strip())
for line in f
if line[:4] == 'ATOM']
new_order = sorted(range(len(write_order)),
key=write_order.__getitem__)
new_order = [i + 1 for i in new_order] # OBMol has 1 based idx
assert len(new_order) == len(ligand.atoms)
else:
# Openbabel 2.3.2 does not support perserving atom order.
# We read back the PDBQT ligand to get "correct" bonding.
ligand = next(oddt.toolkit.readfile('pdbqt', ligand_file))
if 'REMARK' in ligand.data:
del ligand.data['REMARK']
docked_ligands = oddt.toolkit.readfile('pdbqt', ligand_outfile)
for docked_ligand, score in zip(docked_ligands, scores):
# Renumber atoms to match the input ligand
if (is_openbabel_molecule(docked_ligand) and
oddt.toolkits.ob.__version__ >= '2.4.0'):
docked_ligand.OBMol.RenumberAtoms(new_order)
# HACK: copy docked coordinates onto source ligand
# We assume that the order of atoms match between ligands
clone = ligand.clone
clone.clone_coords(docked_ligand)
clone.data.update(score)
# Calculate RMSD to the input pose
try:
clone.data['vina_rmsd_input'] = rmsd(ligand, clone)
clone.data['vina_rmsd_input_min'] = rmsd(ligand, clone,
method='min_symmetry')
except Exception:
pass
output_array.append(clone)
rmtree(ligand_dir)
return output_array
def clean(self):
for d in self.cleanup_dirs:
rmtree(d)
def predict_ligand(self, ligand):
"""Local method to score one ligand and update it's scores.
Parameters
----------
ligand: oddt.toolkit.Molecule object
Ligand to be scored
Returns
-------
ligand: oddt.toolkit.Molecule object
Scored ligand with updated scores
"""
return self.score([ligand])[0]
def predict_ligands(self, ligands):
"""Method to score ligands lazily
Parameters
----------
ligands: iterable of oddt.toolkit.Molecule objects
Ligands to be scored
Returns
-------
ligand: iterator of oddt.toolkit.Molecule objects
Scored ligands with updated scores
"""
return self.score(ligands)
def write_vina_pdbqt(mol, directory, flexible=True, name_id=None):
"""Write single PDBQT molecule to a given directory. For proteins use
`flexible=False` to avoid encoding torsions. Additionally an name ID can
be appended to a name to avoid conflicts.
"""
if name_id is None:
name_id = ''
# We expect name such as 0_ZINC123456.pdbqt or simply ZINC123456.pdbqt if no
# name_id is specified. All non alpha-numeric signs are replaced with underscore.
mol_file = ('_'.join(filter(None, [str(name_id),
re.sub('[^A-Za-z0-9]+', '_', mol.title)]
)) + '.pdbqt')
# prepend path to filename
mol_file = os.path.join(directory, mol_file)
if is_openbabel_molecule(mol):
if flexible:
# auto bonding (b), perserve atom names (n) indices (p) and Hs (h)
kwargs = {'opt': {'b': None, 'p': None, 'h': None, 'n': None}}
else:
# for proteins write rigid mol (r) and combine all frags in one (c)
kwargs = {'opt': {'r': None, 'c': None, 'h': None}}
else:
kwargs = {'flexible': flexible}
# HACK: fix OB 2.3.2 PDBQT bugs
if (not flexible and is_openbabel_molecule(mol) and
oddt.toolkits.ob.__version__ < '2.4.0'):
with open(mol_file, 'w') as f:
for line in mol.write('pdbqt', overwrite=True, **kwargs).split('\n'):
# remove OB 2.3 ROOT/ENDROOT tags
if line in ['ROOT', 'ENDROOT']:
continue
elif line[:7] == 'TORSDOF':
f.write('TER\n')
else:
f.write(line + '\n')
else:
mol.write('pdbqt', mol_file, overwrite=True, **kwargs)
return mol_file
def parse_vina_scoring_output(output):
"""Function parsing Autodock Vina scoring output to a dictionary
Parameters
----------
output : string
Autodock Vina standard ouptud (STDOUT).
Returns
-------
out : dict
dicitionary containing scores computed by Autodock Vina
"""
out = {}
r = re.compile('^(Affinity:|\s{4})')
for line in output.decode('ascii').split('\n')[13:]: # skip some output
if r.match(line):
m = line.replace(' ', '').split(':')
if m[0] == 'Affinity':
m[1] = m[1].replace('(kcal/mol)', '')
out[str('vina_' + m[0].lower())] = float(m[1])
return out
def parse_vina_docking_output(output):
"""Function parsing Autodock Vina docking output to a dictionary
Parameters
----------
output : string
Autodock Vina standard ouptud (STDOUT).
Returns
-------
out : dict
dicitionary containing scores computed by Autodock Vina
"""
out = []
r = re.compile('^\s+\d\s+')
for line in output.decode('ascii').split('\n')[13:]: # skip some output
if r.match(line):
s = line.split()
out.append({'vina_affinity': s[1],
'vina_rmsd_lb': s[2],
'vina_rmsd_ub': s[3]})
return out
# create docking object
# центром докинга дадим геометрический центр лиганда,
# найденный нами ранее - приблизительное положение сайта связывания
dock_obj= oddt.docking.AutodockVina.autodock_vina(
protein=prot,size=(20,20,20),center=lig_center,
executable='./autodock_vina_1_1_2_linux_x86/bin/vina',autocleanup=True, num_modes=9)
print(dock_obj.tmp_dir)
print(" ".join(dock_obj.params))
Параметры -
center_x center_y center_z - координаты центра;
size_x size_y size_z - ограничили докинговый объем для лиганда; = 20^3
exhaustiveness - максимальное время поиска минимума энергии; = 8
num_modes - число моделей в выдаче; 9 (в новой версии 9 - потолок)
energy_range - максимальная разница энергий между лучшей и худшей моделью
res = dock_obj.dock(mols,prot)
# отсортируем результаты по аффинности (параметр vina_affinity):
res_sorted = sorted(res, key = lambda x : float(x.data['vina_affinity']))
for i,r in enumerate(res):
print("%4d%10s%8s%8s" %(i, r.formula, r.data['vina_affinity'], r.data['vina_rmsd_ub']))
Группировка внутри блока по аффинности.
hbtotal = []
hbstrict = []
phob = []
for i,r in enumerate(res_sorted):
int1, int2, strict = oddt.interactions.hbonds(prot,r)
hbtotal.append(len(int1))
hbstrict.append(strict.sum())
ph1, ph2 = oddt.interactions.hydrophobic_contacts(prot,r)
phob.append(len(ph1))
sort_table = pd.DataFrame({'Formula':[r.formula for r in res_sorted],
'Affinity, kcal/mol':[r.data['vina_affinity'] for r in res_sorted],
'Total number of h-bonds':hbtotal, 'Strict number of h-bonds':hbstrict, 'Hydrophobic':phob,
'RMSD':[r.data['vina_rmsd_ub'] for r in res_sorted]})
Сортировка и анализ лучших
sort_table[:]
Видно, что аффинность у гидрофобики получилась лучше всех. В целом, у NAG (C8H15NO6) иногда все было неплохо с водородными, но абсолютный рекорд по этому параметру у C7H13NO7. Самая гидрофобная, как ожидалось, ароматика.
#Сделаем pdb из всех лигандов
selected = [0, 10, 18, 27, 36, 45] # выбрали лучших по группам
for i,r in enumerate(res):
if i not in selected:
continue
r.write(filename='mol%s.pdb' % i, format='pdb', overwrite=True)
До этого момента мне казалось, что все хорошо, но при визуальном анализе стало понятно, что сто-то пошло не так. Ниже приведена картинка лиганда до докинга и после.
display(Image('./9pr/lig.png'))
display(Image('./9pr/36.png'))
Видимо, что-то не так с полем. Атомы водородаостались на месте, а скелкт сместился и смялся в кучу. Я пробовал менять параметры, но лучше не становилось.