In [52]:
import numpy as np
import pandas as pd

import copy

# Отображение структур
import IPython.display
import ipywidgets
from IPython.display import display,display_svg,SVG,Image

# Open Drug Discovery Toolkit
import oddt
import oddt.docking
import oddt.interactions

# Органика
from rdkit.Chem import Draw
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole

Подготовка белка

In [53]:
import mdtraj as md

u = md.load('plus_ligand.B99990001.pdb')
pdb = u.topology
for i,r in enumerate(pdb.atoms):
    print(i,r)
    #посмотрим на атомы
In [54]:
# сохраним pdb без лиганда
u2 = u.atom_slice(range(1062))
u2.save_pdb('1lmp-clean.pdb', force_overwrite=True)
In [55]:
#проверим точно ли больше нет лиганда
m = md.load('1lmp-clean.pdb')
pdb_n = m.topology
for it,ra in enumerate(pdb_n.atoms):
    print(it,ra)
    #посмотрим на атомы - да, остался только белок
In [56]:
#нахождение геометрического центра лиганда
lig = u.atom_slice(range(1062,1104)) #список номера атомов лиганда
for v in lig.xyz:
    lig_center = np.mean(v, axis=0)* 10
lig_center
print(lig_center)
lig.save_pdb('lig.pdb', force_overwrite=True)

Подготовка белка для докинга

In [57]:
#from openbabel import pybel
import pybel
prot = next(oddt.toolkit.readfile('pdb','1lmp-clean.pdb'))
prot.protein = True

#prot.addh(only_polar=True)
#prot.calccharges()

#prot.OBMol.AddPolarHydrogens()
#prot.OBMol.AutomaticPartialCharge()


print('is it the first mol in 1lmp is protein?', prot.protein,':) and MW of this mol is:', prot.molwt)

Лиганды для докинга

In [58]:
smiles = ["CC(=O)NC1C(O)OC(CO)C(O)C1O", # Common NAG
          "OC(=O)NC1C(O)OC(CO)C(O)C1O", # -OH
          "[NH3+]C(=O)NC1C(O)OC(CO)C(O)C1O",  # -NH3+
          "C(=O)NC1C(O)OC(CO)C(O)C1O", # -H
          "C=1(C=CC=CC1)C(=O)NC1C(O)OC(CO)C(O)C1O", # -Ph
          "C(C([O-])=O)(=O)NC1C(O)OC(CO)C(O)C1O"] # -COO-
In [59]:
mols= []
images =[]

for s in smiles:
    m = oddt.toolkit.readstring('smi', s)
    if not m.OBMol.Has3D(): 
        m.make3D(forcefield='mmff94', steps=150)
        ### Опять всё сделают за Вас
        ###m.removeh()
        ###m.OBMol.AddPolarHydrogens()
        #Chem.rdMolDescriptors.CalcMolFormula(m.Mol)
    print(s, "ok")
     
    mols.append(m)
    ###with print m.OBMol.Has3D() was found that:
    ### deep copy needed to keep 3D , write svg make mols flat
    images.append((SVG(copy.deepcopy(m).write('svg'))))
    
display_svg(*images)
- Open Babel Depiction H H H H H H H H H H H H H H H O O O O O N O
- Open Babel Depiction O H H H H H H H H H H H H H O O O O O N O
- Open Babel Depiction O H H H H H H H H H H H H O + N O O O N O H H H
- Open Babel Depiction O H H H H H H H H H H H H H O O O O N O
- Open Babel Depiction H O H H H H H H H H H H H H H H H H O N O O O O
- Open Babel Depiction O H H H H H H H H H H H H O O O O N O O _ O
mols= [] images =[] for s in smiles: m = oddt.toolkit.readstring('smi', s) if not m.OBMol.Has3D(): m.make3D(forcefield='mmff94', steps=150) m.removeh() m.OBMol.AddPolarHydrogens() print(s, "ok") mols.append(m) ###with print m.OBMol.Has3D() was found that: ### deep copy needed to keep 3D , write svg make mols flat images.append((SVG(copy.deepcopy(m).write('svg')))) display_svg(*images)
In [16]:
# кусок из пакета, чтобы не танцевать с бубном 

import sys
import subprocess
import re
import os
import warnings
from tempfile import mkdtemp
from shutil import rmtree

from six import string_types

import oddt
from oddt.utils import (is_openbabel_molecule,
                        is_molecule,
                        check_molecule)
from oddt.spatial import rmsd


class autodock_vina(object):
    def __init__(self,
                 protein=None,
                 auto_ligand=None,
                 size=(20, 20, 20),
                 center=(0, 0, 0),
                 exhaustiveness=8,
                 num_modes=9,
                 energy_range=3,
                 seed=None,
                 prefix_dir='/tmp',
                 n_cpu=1,
                 executable=None,
                 autocleanup=True,
                 skip_bad_mols=True):
        """Autodock Vina docking engine, which extends it's capabilities:
        automatic box (auto-centering on ligand).

        Parameters
        ----------
        protein: oddt.toolkit.Molecule object (default=None)
            Protein object to be used while generating descriptors.

        auto_ligand: oddt.toolkit.Molecule object or string (default=None)
            Ligand use to center the docking box. Either ODDT molecule or
            a file (opened based on extesion and read to ODDT molecule).
            Box is centered on geometric center of molecule.

        size: tuple, shape=[3] (default=(20, 20, 20))
            Dimentions of docking box (in Angstroms)

        center: tuple, shape=[3] (default=(0,0,0))
            The center of docking box in cartesian space.

        exhaustiveness: int (default=8)
            Exhaustiveness parameter of Autodock Vina

        num_modes: int (default=9)
            Number of conformations generated by Autodock Vina. The maximum
            number of docked poses is 9 (due to Autodock Vina limitation).

        energy_range: int (default=3)
            Energy range cutoff for Autodock Vina

        seed: int or None (default=None)
            Random seed for Autodock Vina

        prefix_dir: string (default=/tmp)
            Temporary directory for Autodock Vina files

        executable: string or None (default=None)
            Autodock Vina executable location in the system.
            It's realy necessary if autodetection fails.

        autocleanup: bool (default=True)
            Should the docking engine clean up after execution?

        skip_bad_mols: bool (default=True)
            Should molecules that crash Autodock Vina be skipped.
        """
        self.dir = prefix_dir
        self._tmp_dir = None
        # define binding site
        self.size = size
        self.center = center
        # center automaticaly on ligand
        if auto_ligand:
            if isinstance(auto_ligand, string_types):
                extension = auto_ligand.split('.')[-1]
                auto_ligand = next(oddt.toolkit.readfile(extension, auto_ligand))
            self.center = auto_ligand.coords.mean(axis=0).round(3)
        # autodetect Vina executable
        if not executable:
            try:
                self.executable = (subprocess.check_output(['which', 'vina'])
                                   .decode('ascii').split('\n')[0])
            except subprocess.CalledProcessError:
                raise Exception('Could not find Autodock Vina binary.'
                                'You have to install it globaly or supply binary'
                                'full directory via `executable` parameter.')
        else:
            self.executable = executable
        # detect version
        self.version = (subprocess.check_output([self.executable, '--version'])
                        .decode('ascii').split(' ')[2])
        self.autocleanup = autocleanup
        self.cleanup_dirs = set()

        # share protein to class
        self.protein = None
        self.protein_file = None
        if protein:
            self.set_protein(protein)
        self.skip_bad_mols = skip_bad_mols
        self.n_cpu = n_cpu
        if self.n_cpu > exhaustiveness:
            warnings.warn('Exhaustiveness is lower than n_cpus, thus CPU will '
                          'not be saturated.')

        # pregenerate common Vina parameters
        self.params = []
        self.params += ['--center_x', str(self.center[0]),
                        '--center_y', str(self.center[1]),
                        '--center_z', str(self.center[2])]
        self.params += ['--size_x', str(self.size[0]),
                        '--size_y', str(self.size[1]),
                        '--size_z', str(self.size[2])]
        self.params += ['--exhaustiveness', str(exhaustiveness)]
        if seed is not None:
            self.params += ['--seed', str(seed)]
        if num_modes > 9 or num_modes < 1:
            raise ValueError('The number of docked poses must be between 1 and 9'
                             ' (due to Autodock Vina limitation).')
        self.params += ['--num_modes', str(num_modes)]
        self.params += ['--energy_range', str(energy_range)]

    @property
    def tmp_dir(self):
        if not self._tmp_dir:
            self._tmp_dir = mkdtemp(dir=self.dir, prefix='autodock_vina_')
            self.cleanup_dirs.add(self._tmp_dir)
        return self._tmp_dir

    @tmp_dir.setter
    def tmp_dir(self, value):
        self._tmp_dir = value

    def set_protein(self, protein):
        """Change protein to dock to.

        Parameters
        ----------
        protein: oddt.toolkit.Molecule object
            Protein object to be used.
        """
        # generate new directory
        self._tmp_dir = None
        if protein:
            if isinstance(protein, string_types):
                extension = protein.split('.')[-1]
                if extension == 'pdbqt':
                    self.protein_file = protein
                    self.protein = next(oddt.toolkit.readfile(extension, protein))
                    self.protein.protein = True
                else:
                    self.protein = next(oddt.toolkit.readfile(extension, protein))
                    self.protein.protein = True
            else:
                self.protein = protein

            # skip writing if we have PDBQT protein
            if self.protein_file is None:
                self.protein_file = write_vina_pdbqt(self.protein, self.tmp_dir,
                                                     flexible=False)


    def score(self, ligands, protein=None):
        """Automated scoring procedure.

        Parameters
        ----------
        ligands: iterable of oddt.toolkit.Molecule objects
            Ligands to score

        protein: oddt.toolkit.Molecule object or None
            Protein object to be used. If None, then the default
            one is used, else the protein is new default.

        Returns
        -------
        ligands : array of oddt.toolkit.Molecule objects
            Array of ligands (scores are stored in mol.data method)
        """
        if protein:
            self.set_protein(protein)
        if not self.protein_file:
            raise IOError("No receptor.")
        if is_molecule(ligands):
            ligands = [ligands]
        ligand_dir = mkdtemp(dir=self.tmp_dir, prefix='ligands_')
        output_array = []
        for n, ligand in enumerate(ligands):
            check_molecule(ligand, force_coords=True)
            ligand_file = write_vina_pdbqt(ligand, ligand_dir, name_id=n)
            try:
                scores = parse_vina_scoring_output(
                    subprocess.check_output([self.executable, '--score_only',
                                             '--receptor', self.protein_file,
                                             '--ligand', ligand_file] + self.params,
                                            stderr=subprocess.STDOUT))
            except subprocess.CalledProcessError as e:
                sys.stderr.write(e.output.decode('ascii'))
                if self.skip_bad_mols:
                    continue
                else:
                    raise Exception('Autodock Vina failed. Command: "%s"' %
                                    ' '.join(e.cmd))
            ligand.data.update(scores)
            output_array.append(ligand)
        rmtree(ligand_dir)
        return output_array


    def dock(self, ligands, protein=None):
        """Automated docking procedure.

        Parameters
        ----------
        ligands: iterable of oddt.toolkit.Molecule objects
            Ligands to dock

        protein: oddt.toolkit.Molecule object or None
            Protein object to be used. If None, then the default one
            is used, else the protein is new default.

        Returns
        -------
        ligands : array of oddt.toolkit.Molecule objects
            Array of ligands (scores are stored in mol.data method)
        """
        if protein:
            self.set_protein(protein)
        if not self.protein_file:
            raise IOError("No receptor.")
        if is_molecule(ligands):
            ligands = [ligands]
        ligand_dir = mkdtemp(dir=self.tmp_dir, prefix='ligands_')
        output_array = []
        for n, ligand in enumerate(ligands):
            check_molecule(ligand, force_coords=True)
            ligand_file = write_vina_pdbqt(ligand, ligand_dir, name_id=n)
            ligand_outfile = ligand_file[:-6] + '_out.pdbqt'
            try:
                scores = parse_vina_docking_output(
                    subprocess.check_output([self.executable, '--receptor',
                                             self.protein_file,
                                             '--ligand', ligand_file,
                                             '--out', ligand_outfile] +
                                            self.params +
                                            ['--cpu', str(self.n_cpu)],
                                            stderr=subprocess.STDOUT))
            except subprocess.CalledProcessError as e:
                sys.stderr.write(e.output.decode('ascii'))
                if self.skip_bad_mols:
                    continue  # TODO: print some warning message
                else:
                    raise Exception('Autodock Vina failed. Command: "%s"' %
                                    ' '.join(e.cmd))

            # docked conformations may have wrong connectivity - use source ligand
            if is_openbabel_molecule(ligand):
                if oddt.toolkits.ob.__version__ >= '2.4.0':
                    # find the order of PDBQT atoms assigned by OpenBabel
                    with open(ligand_file) as f:
                        write_order = [int(line[7:12].strip())
                                       for line in f
                                       if line[:4] == 'ATOM']
                    new_order = sorted(range(len(write_order)),
                                       key=write_order.__getitem__)
                    new_order = [i + 1 for i in new_order]  # OBMol has 1 based idx

                    assert len(new_order) == len(ligand.atoms)
                else:
                    # Openbabel 2.3.2 does not support perserving atom order.
                    # We read back the PDBQT ligand to get "correct" bonding.
                    ligand = next(oddt.toolkit.readfile('pdbqt', ligand_file))
                    if 'REMARK' in ligand.data:
                        del ligand.data['REMARK']

            docked_ligands = oddt.toolkit.readfile('pdbqt', ligand_outfile)
            for docked_ligand, score in zip(docked_ligands, scores):
                # Renumber atoms to match the input ligand
                if (is_openbabel_molecule(docked_ligand) and
                        oddt.toolkits.ob.__version__ >= '2.4.0'):
                    docked_ligand.OBMol.RenumberAtoms(new_order)
                # HACK: copy docked coordinates onto source ligand
                # We assume that the order of atoms match between ligands
                clone = ligand.clone
                clone.clone_coords(docked_ligand)
                clone.data.update(score)

                # Calculate RMSD to the input pose
                try:
                    clone.data['vina_rmsd_input'] = rmsd(ligand, clone)
                    clone.data['vina_rmsd_input_min'] = rmsd(ligand, clone,
                                                             method='min_symmetry')
                except Exception:
                    pass
                output_array.append(clone)
        rmtree(ligand_dir)
        return output_array


    def clean(self):
        for d in self.cleanup_dirs:
            rmtree(d)


    def predict_ligand(self, ligand):
        """Local method to score one ligand and update it's scores.

        Parameters
        ----------
        ligand: oddt.toolkit.Molecule object
            Ligand to be scored

        Returns
        -------
        ligand: oddt.toolkit.Molecule object
            Scored ligand with updated scores
        """
        return self.score([ligand])[0]


    def predict_ligands(self, ligands):
        """Method to score ligands lazily

        Parameters
        ----------
        ligands: iterable of oddt.toolkit.Molecule objects
            Ligands to be scored

        Returns
        -------
        ligand: iterator of oddt.toolkit.Molecule objects
            Scored ligands with updated scores
        """
        return self.score(ligands)



def write_vina_pdbqt(mol, directory, flexible=True, name_id=None):
    """Write single PDBQT molecule to a given directory. For proteins use
    `flexible=False` to avoid encoding torsions. Additionally an name ID can
    be appended to a name to avoid conflicts.
    """
    if name_id is None:
        name_id = ''

    # We expect name such as 0_ZINC123456.pdbqt or simply ZINC123456.pdbqt if no
    # name_id is specified. All non alpha-numeric signs are replaced with underscore.
    mol_file = ('_'.join(filter(None, [str(name_id),
                                       re.sub('[^A-Za-z0-9]+', '_', mol.title)]
                                )) + '.pdbqt')
    # prepend path to filename
    mol_file = os.path.join(directory, mol_file)

    if is_openbabel_molecule(mol):
        if flexible:
            # auto bonding (b), perserve atom names (n) indices (p) and Hs (h)
            kwargs = {'opt': {'b': None, 'p': None, 'h': None, 'n': None}}
        else:
            # for proteins write rigid mol (r) and combine all frags in one (c)
            kwargs = {'opt': {'r': None, 'c': None, 'h': None}}

    else:
        kwargs = {'flexible': flexible}

    # HACK: fix OB 2.3.2 PDBQT bugs
    if (not flexible and is_openbabel_molecule(mol) and
            oddt.toolkits.ob.__version__ < '2.4.0'):
        with open(mol_file, 'w') as f:
            for line in mol.write('pdbqt', overwrite=True, **kwargs).split('\n'):
                # remove OB 2.3 ROOT/ENDROOT tags
                if line in ['ROOT', 'ENDROOT']:
                    continue
                elif line[:7] == 'TORSDOF':
                    f.write('TER\n')
                else:
                    f.write(line + '\n')
    else:
        mol.write('pdbqt', mol_file, overwrite=True, **kwargs)
    return mol_file



def parse_vina_scoring_output(output):
    """Function parsing Autodock Vina scoring output to a dictionary

    Parameters
    ----------
    output : string
        Autodock Vina standard ouptud (STDOUT).

    Returns
    -------
    out : dict
        dicitionary containing scores computed by Autodock Vina
    """
    out = {}
    r = re.compile('^(Affinity:|\s{4})')
    for line in output.decode('ascii').split('\n')[13:]:  # skip some output
        if r.match(line):
            m = line.replace(' ', '').split(':')
            if m[0] == 'Affinity':
                m[1] = m[1].replace('(kcal/mol)', '')
            out[str('vina_' + m[0].lower())] = float(m[1])
    return out



def parse_vina_docking_output(output):
    """Function parsing Autodock Vina docking output to a dictionary

    Parameters
    ----------
    output : string
        Autodock Vina standard ouptud (STDOUT).

    Returns
    -------
    out : dict
        dicitionary containing scores computed by Autodock Vina
    """
    out = []
    r = re.compile('^\s+\d\s+')
    for line in output.decode('ascii').split('\n')[13:]:  # skip some output
        if r.match(line):
            s = line.split()
            out.append({'vina_affinity': s[1],
                        'vina_rmsd_lb': s[2],
                        'vina_rmsd_ub': s[3]})
    return out
In [60]:
# create docking object
# центром докинга дадим геометрический центр лиганда,
# найденный нами ранее - приблизительное положение сайта связывания
dock_obj= oddt.docking.AutodockVina.autodock_vina(
    protein=prot,size=(20,20,20),center=lig_center,
    executable='./autodock_vina_1_1_2_linux_x86/bin/vina',autocleanup=True, num_modes=9)

print(dock_obj.tmp_dir)
print(" ".join(dock_obj.params))

Параметры -

center_x center_y center_z - координаты центра;

size_x size_y size_z - ограничили докинговый объем для лиганда; = 20^3

exhaustiveness - максимальное время поиска минимума энергии; = 8

num_modes - число моделей в выдаче; 9 (в новой версии 9 - потолок)

energy_range - максимальная разница энергий между лучшей и худшей моделью

In [61]:
res = dock_obj.dock(mols,prot)
In [62]:
# отсортируем результаты по аффинности (параметр vina_affinity):
res_sorted = sorted(res, key = lambda x : float(x.data['vina_affinity']))
In [63]:
for i,r in enumerate(res):
    print("%4d%10s%8s%8s" %(i, r.formula, r.data['vina_affinity'],  r.data['vina_rmsd_ub']))

Группировка внутри блока по аффинности.

In [64]:
hbtotal = []
hbstrict = []
phob = []
for i,r in enumerate(res_sorted):
    int1, int2, strict = oddt.interactions.hbonds(prot,r)
    hbtotal.append(len(int1))
    hbstrict.append(strict.sum())
    ph1, ph2 = oddt.interactions.hydrophobic_contacts(prot,r)
    phob.append(len(ph1))
sort_table = pd.DataFrame({'Formula':[r.formula for r in res_sorted],
                   'Affinity, kcal/mol':[r.data['vina_affinity'] for r in res_sorted],
                   'Total number of h-bonds':hbtotal, 'Strict number of h-bonds':hbstrict, 'Hydrophobic':phob, 
                    'RMSD':[r.data['vina_rmsd_ub'] for r in res_sorted]})

Сортировка и анализ лучших

In [65]:
sort_table[:]
Out[65]:
Formula Affinity, kcal/mol Total number of h-bonds Strict number of h-bonds Hydrophobic RMSD
0 C13H17NO6 -6.3 4 3 0 0.000
1 C13H17NO6 -6.2 3 3 2 6.726
2 C13H17NO6 -6.1 7 7 2 6.590
3 C13H17NO6 -6.0 6 6 2 7.001
4 C13H17NO6 -5.9 1 1 1 6.651
5 C13H17NO6 -5.9 5 4 3 7.151
6 C7H13NO7 -5.8 6 5 0 0.000
7 C7H15N2O6+ -5.8 4 4 0 0.000
8 C7H13NO7 -5.7 4 3 0 9.330
9 C7H15N2O6+ -5.7 3 2 0 9.350
10 C13H17NO6 -5.7 2 2 5 7.530
11 C13H17NO6 -5.7 3 3 4 7.271
12 C7H13NO7 -5.6 6 6 0 9.573
13 C13H17NO6 -5.6 2 2 4 6.873
14 C8H15NO6 -5.5 4 4 1 0.000
15 C7H15N2O6+ -5.5 3 3 0 10.094
16 C7H15N2O6+ -5.3 8 6 0 5.519
17 C7H13NO6 -5.3 3 3 0 0.000
18 C8H12NO8- -5.3 7 7 0 0.000
19 C8H12NO8- -5.3 4 4 0 5.847
20 C8H15NO6 -5.2 4 3 2 8.914
21 C7H13NO7 -5.2 10 8 0 5.576
22 C7H15N2O6+ -5.2 4 4 0 9.268
23 C7H15N2O6+ -5.2 8 7 0 3.696
24 C7H15N2O6+ -5.2 3 3 0 4.806
25 C7H15N2O6+ -5.2 4 4 0 4.973
26 C8H12NO8- -5.2 3 3 0 5.386
27 C8H15NO6 -5.1 6 6 1 6.116
28 C7H13NO6 -5.1 4 4 0 3.619
29 C8H12NO8- -5.1 1 0 0 8.573
30 C8H12NO8- -5.1 2 2 0 3.828
31 C8H15NO6 -5.0 6 5 2 7.601
32 C8H15NO6 -5.0 3 2 0 9.046
33 C8H15NO6 -5.0 8 6 1 9.284
34 C8H15NO6 -5.0 5 4 0 3.975
35 C7H13NO7 -5.0 7 7 0 5.079
36 C7H13NO7 -5.0 4 4 0 9.032
37 C7H15N2O6+ -5.0 6 3 0 4.929
38 C7H13NO6 -5.0 5 5 0 4.430
39 C7H13NO6 -5.0 5 5 0 6.088
40 C8H12NO8- -5.0 7 4 0 3.568
41 C8H12NO8- -5.0 4 4 0 5.254
42 C8H15NO6 -4.9 3 3 0 4.931
43 C7H13NO7 -4.9 9 8 0 8.780
44 C7H13NO7 -4.9 6 5 0 7.546
45 C7H13NO7 -4.9 6 4 0 3.490
46 C7H13NO6 -4.9 2 1 0 9.315
47 C8H12NO8- -4.9 4 4 0 8.125
48 C8H15NO6 -4.8 6 5 0 10.477
49 C7H13NO6 -4.8 7 5 0 7.581
50 C7H13NO6 -4.8 8 5 0 9.443
51 C8H12NO8- -4.8 0 0 0 5.807
52 C7H13NO6 -4.7 4 4 0 4.538
53 C7H13NO6 -4.7 6 4 0 9.157

Видно, что аффинность у гидрофобики получилась лучше всех. В целом, у NAG (C8H15NO6) иногда все было неплохо с водородными, но абсолютный рекорд по этому параметру у C7H13NO7. Самая гидрофобная, как ожидалось, ароматика.

In [67]:
#Сделаем pdb из всех лигандов
selected = [0, 10, 18, 27, 36, 45] # выбрали лучших по группам
for i,r in enumerate(res):
    if i not in selected:
        continue

    r.write(filename='mol%s.pdb' % i, format='pdb', overwrite=True) 

До этого момента мне казалось, что все хорошо, но при визуальном анализе стало понятно, что сто-то пошло не так. Ниже приведена картинка лиганда до докинга и после.

In [42]:
display(Image('./9pr/lig.png'))
In [44]:
display(Image('./9pr/36.png'))

Видимо, что-то не так с полем. Атомы водородаостались на месте, а скелкт сместился и смялся в кучу. Я пробовал менять параметры, но лучше не становилось.