!pip install -q condacolab
import condacolab
condacolab.install()

!conda install -c conda-forge gemmi openbabel biopython

!pip install pdbe-arpeggio

!pdbe-arpeggio -h

usage: pdbe-arpeggio [-h] [-s SELECTION [SELECTION ...] | -sf SELECTION_FILE] [-wh] [-mh]
                     [-ms MINIMISATION_STEPS] [-mf {MMFF94,UFF,Ghemical}]
                     [-mm {DistanceGeometry,SteepestDescent,ConjugateGradients}] [-co VDW_COMP]
                     [-i INTERACTING] [-ph PH] [-sa] [-a] [-o OUTPUT] [-m]
                     filename

        ############
        # ARPEGGIO #
        ############

        A library for calculating interatomic contacts in proteins

        Dependencies:
        - Python >= 3.6
        - Numpy
        - BioPython >= v1.80
        - Open Babel >= 3.0.0
        - gemmi >= 0.5.8

        

positional arguments:
  filename              Path to the file to be analysed.

options:
  -h, --help            show this help message and exit
  -s SELECTION [SELECTION ...], --selection SELECTION [SELECTION ...]
                        Select the "ligand" for interactions, using selection syntax: /<chain_id>/<res_num>[<ins_code>]/<atom_name> or RESNAME:<het_id>. Fields can be omitted.
  -sf SELECTION_FILE, --selection-file SELECTION_FILE
                        Selections as above, but listed in a file.
  -wh, --write-hydrogenated
                        Write an output file including the added hydrogen coordinates.
  -mh, --minimise-hydrogens
                        Energy minimise OpenBabel added hydrogens.
  -ms MINIMISATION_STEPS, --minimisation-steps MINIMISATION_STEPS
                        Number of hydrogen minimisation steps to perform.
  -mf {MMFF94,UFF,Ghemical}, --minimisation-forcefield {MMFF94,UFF,Ghemical}
                        Choose the forcefield to minimise hydrogens with. Ghemical is not recommended.
  -mm {DistanceGeometry,SteepestDescent,ConjugateGradients}, --minimisation-method {DistanceGeometry,SteepestDescent,ConjugateGradients}
                        Choose the method to minimise hydrogens with. ConjugateGradients is recommended.
  -co VDW_COMP, --vdw-comp VDW_COMP
                        Compensation factor for VdW radii dependent interaction types.
  -i INTERACTING, --interacting INTERACTING
                        Distance cutoff for grid points to be 'interacting' with the entity.
  -ph PH                pH for hydrogen addition.
  -sa, --include-sequence-adjacent
                        For intra-polypeptide interactions, include non-bonding interactions between residues that are next to each other in sequence; this is not done by default.
  -a, --use-ambiguities
                        Turn on abiguous definitions for ambiguous contacts.
  -o OUTPUT, --output OUTPUT
                        Define output directory.
  -m, --mute            Silent mode without debug info.

import json
import pandas as pd

!wget http://www.rcsb.org/pdb/files/7DGD.cif.gz
!gzip -d 7DGD.cif.gz
!ls

--2023-12-13 16:59:13--  http://www.rcsb.org/pdb/files/7DGD.cif.gz
Resolving www.rcsb.org (www.rcsb.org)... 128.6.159.248
Connecting to www.rcsb.org (www.rcsb.org)|128.6.159.248|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://www.rcsb.org/pdb/files/7DGD.cif.gz [following]
--2023-12-13 16:59:13--  https://www.rcsb.org/pdb/files/7DGD.cif.gz
Connecting to www.rcsb.org (www.rcsb.org)|128.6.159.248|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://files.rcsb.org/download/7DGD.cif.gz [following]
--2023-12-13 16:59:13--  https://files.rcsb.org/download/7DGD.cif.gz
Resolving files.rcsb.org (files.rcsb.org)... 128.6.159.157
Connecting to files.rcsb.org (files.rcsb.org)|128.6.159.157|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 280261 (274K) [application/octet-stream]
Saving to: ‘7DGD.cif.gz’

7DGD.cif.gz         100%[===================>] 273.69K  --.-KB/s    in 0.07s   

2023-12-13 16:59:13 (3.61 MB/s) - ‘7DGD.cif.gz’ saved [280261/280261]

7DGD.cif  condacolab_install.log  sample_data

!mkdir ./arp-results3

!pdbe-arpeggio 7DGD.cif -ph 7.0 -wh -o ./arp-results3 -s /A//

!ls ./arp-results3

7DGD_hydrogenated.mmcif  7DGD.json

with open('./arp-results3/7DGD.json', 'r') as js_file:
  contacts = json.load(js_file)

output_dict = [x for x in contacts if (any(y in ['proximal','vdw','vdw_clash','clash'] for y in x['contact']) and (x['interacting_entities'] == 'INTRA_SELECTION') ) ] #

df = pd.json_normalize(output_dict)
df

df_protein_only = df.loc[(df['bgn.label_comp_type'] == 'P') & (df['end.label_comp_type'] == 'P')]

df_droped = df_protein_only.drop_duplicates(subset=['bgn.auth_seq_id', 'end.auth_seq_id'], keep='first')
df_droped.sort_values(by=['distance'])

def split_value(split_pos, df):
  intA = len(df[(df["bgn.auth_seq_id"] <= split_pos) & (df["end.auth_seq_id"] <= split_pos)])
  intB = len(df[(df["bgn.auth_seq_id"] > split_pos) & (df["end.auth_seq_id"] > split_pos)])
  extAB = len(df[((df["bgn.auth_seq_id"] > split_pos) & (df["end.auth_seq_id"] <= split_pos)) | ((df["bgn.auth_seq_id"] <= split_pos) & (df["end.auth_seq_id"] > split_pos))])

  if extAB != 0:
    return (intA/extAB)*(intB/extAB)
  else:
    return 0

split_data = [split_value(i+1, df_droped) for i in range(833)] #833 - длина цепи А

from matplotlib import pyplot as plt
import seaborn as sns

plt.figure(1)
sns.set_style("darkgrid")
plt.plot(range(1, 834), split_data, color='#76ee00')
plt.xlabel("Residues")
plt.ylabel("SplitValue")

Text(0, 0.5, 'SplitValue')

# приблизим график и найдём пики

plt.figure(1)
sns.set_style("darkgrid")
plt.plot(range(1, 834), split_data, color='#ee145b')
plt.xlim(510, 600)
plt.show()

first_peak = split_data.index(max(split_data[520:530]))
second_peak = split_data.index(max(split_data[570:590]))
print(f'Первый пик: {first_peak}. Второй пик: {second_peak}')

Первый пик: 522. Второй пик: 580

from IPython.display import Image

Image("domains.png", width=1000)

	contact	distance	interacting_entities	type	bgn.auth_asym_id	bgn.auth_atom_id	bgn.auth_seq_id	bgn.label_comp_id	bgn.label_comp_type	bgn.pdbx_PDB_ins_code	end.auth_asym_id	end.auth_atom_id	end.auth_seq_id	end.label_comp_id	end.label_comp_type	end.pdbx_PDB_ins_code
0	[proximal]	4.88	INTRA_SELECTION	atom-atom	A	O	695	THR	P		A	CB	697	LYS	P
1	[proximal]	4.97	INTRA_SELECTION	atom-atom	A	CA	697	LYS	P		A	O	695	THR	P
2	[proximal]	4.45	INTRA_SELECTION	atom-atom	A	CA	694	CYS	P		A	O	692	LYS	P
3	[proximal]	4.79	INTRA_SELECTION	atom-atom	A	O	692	LYS	P		A	CA	688	GLY	P
4	[proximal]	4.33	INTRA_SELECTION	atom-atom	A	N	698	PRO	P		A	CG	696	ARG	P
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
26962	[proximal]	4.78	INTRA_SELECTION	atom-atom	A	CE2	418	TYR	P		A	NZ	88	LYS	P
26963	[proximal]	4.25	INTRA_SELECTION	atom-atom	A	NZ	88	LYS	P		A	CZ	418	TYR	P
26964	[proximal, polar]	3.34	INTRA_SELECTION	atom-atom	A	OH	418	TYR	P		A	NZ	88	LYS	P
26965	[proximal]	4.67	INTRA_SELECTION	atom-atom	A	CG	88	LYS	P		A	CD2	84	HIS	P
26966	[proximal]	4.76	INTRA_SELECTION	atom-atom	A	CG	88	LYS	P		A	NE2	84	HIS	P

	contact	distance	interacting_entities	type	bgn.auth_asym_id	bgn.auth_atom_id	bgn.auth_seq_id	bgn.label_comp_id	bgn.label_comp_type	bgn.pdbx_PDB_ins_code	end.auth_asym_id	end.auth_atom_id	end.auth_seq_id	end.label_comp_id	end.label_comp_type	end.pdbx_PDB_ins_code
11897	[vdw_clash]	2.72	INTRA_SELECTION	atom-atom	A	O	537	ARG	P		A	O	541	VAL	P
6593	[vdw_clash, polar]	2.73	INTRA_SELECTION	atom-atom	A	O	664	VAL	P		A	OG	668	SER	P
20652	[vdw_clash, polar]	2.73	INTRA_SELECTION	atom-atom	A	OG	344	SER	P		A	O	408	SER	P
384	[vdw_clash, polar]	2.77	INTRA_SELECTION	atom-atom	A	N	680	ASN	P		A	O	676	VAL	P
12486	[vdw_clash]	2.77	INTRA_SELECTION	atom-atom	A	O	560	PHE	P		A	O	535	VAL	P
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
12760	[proximal]	5.00	INTRA_SELECTION	atom-atom	A	CB	550	CYS	P		A	CA	562	CYS	P
7218	[proximal]	5.00	INTRA_SELECTION	atom-atom	A	CG	651	LYS	P		A	CZ	585	TYR	P
5182	[proximal]	5.00	INTRA_SELECTION	atom-atom	A	N	722	VAL	P		A	CB	725	ILE	P
4814	[proximal]	5.00	INTRA_SELECTION	atom-atom	A	O	192	LEU	P		A	O	194	ASP	P
18242	[proximal]	5.00	INTRA_SELECTION	atom-atom	A	C	813	ILE	P		A	C	816	CYS	P

Задание 1.¶