import numpy as np
import pandas as pd
import os
# Names of the DSSP result files to process. Sub-directories are skipped
# (open() cannot read them) and the list is sorted so that the processing
# order does not depend on the order in which the OS lists the directory.
files = sorted(
    name for name in os.listdir('DSSP')
    if os.path.isfile(os.path.join('DSSP', name))
)
# Reading the DSSP files
def ss_simplify(code):
    """Reduce a secondary-structure code to one of three labels.

    The codes 'E' and 'H' are returned unchanged; every other value is
    mapped to 'C'.
    """
    return code if code in ('E', 'H') else 'C'
def parseDSSP(fnl, dssp_dir='DSSP'):
    """Parse DSSP output files into a single per-residue table.

    Parameters
    ----------
    fnl : iterable of str
        Names of the files to read, relative to ``dssp_dir``.
    dssp_dir : str, optional
        Directory that contains the files (default ``'DSSP'``).

    Returns
    -------
    pandas.DataFrame
        One row per accepted residue record, numbered 0..n-1 in reading
        order, with the columns ``Resn`` (character at position 13 of the
        record, lowercase letters replaced by ``'C'``), ``Resi`` (integer
        parsed from positions 5-9) and ``SS`` (character at position 16
        passed through ``ss_simplify``).

    Raises
    ------
    ValueError
        If positions 5-9 of an accepted record do not hold an integer.
    """
    # Rows are collected in a list and converted once at the end; growing a
    # DataFrame one row at a time re-allocates it on every insertion.
    rows = []
    for fname in fnl:
        with open(os.path.join(dssp_dir, fname), 'r') as fh:
            for line in fh:
                tk = line.split()
                # Skip blank lines (no tokens at all), lines whose last token
                # is '.', and the line that starts with '#'; in DSSP output
                # these appear to be the header block and the column titles.
                if not tk or tk[-1] == '.' or tk[0] == '#':
                    continue
                resname = line[13]
                # 'X' is an unknown residue and '!' marks a chain break.
                if resname in ('X', '!'):
                    continue
                # DSSP labels SS-bridged cysteines with lowercase letters
                # (a, b, c, ...), one letter per bridge; fold them all to 'C'.
                if resname.islower():
                    resname = 'C'
                rows.append(
                    (resname, int(line[5:10]), ss_simplify(line[16]))
                )
    return pd.DataFrame(rows, columns=['Resn', 'Resi', 'SS'])
# Per-residue table for the whole data set (named so that it does not shadow
# the builtin all()).
residues = parseDSSP(files)

# Number of residues in each secondary-structure class
summ_SS = residues.groupby(by=['SS']).count()
# Number of residues of each amino-acid type in the data set
summ_RES = residues.groupby(by=['Resn']).count()
# Number of residues of each type found in each secondary-structure class
summ_RES_SS = residues.groupby(by=['SS', 'Resn']).count()

# Propensity of residue type aa for structure class ss:
#     P(ss, aa) = N(ss, aa) * N_total / (N(aa) * N(ss))
# Rows are SS classes, columns are residue types. Pairs that never occur in
# the data get a count of 0 (propensity 0.0) instead of NaN.
pair_counts = summ_RES_SS['Resi'].unstack('Resn', fill_value=0)
s_fin = (
    pair_counts.mul(len(residues))
    .div(summ_RES['Resi'], axis='columns')   # divide each column by N(aa)
    .div(summ_SS['Resn'], axis='index')      # divide each row by N(ss)
    .round(2)
)

# Final table; printed explicitly because a bare expression is only echoed in
# an interactive session.
print(s_fin)
Resn | A | C | D | E | F | G | H | I | K | L | M | N | P | Q | R | S | T | V | W | Y |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
SS | ||||||||||||||||||||
C | 0.77 | 1.28 | 1.51 | 0.85 | 0.78 | 1.56 | 1.17 | 0.64 | 1.03 | 0.67 | 0.74 | 1.41 | 1.71 | 0.77 | 0.88 | 1.12 | 1.06 | 0.69 | 0.83 | 0.80 |
E | 0.81 | 1.47 | 0.38 | 0.68 | 1.41 | 0.59 | 0.69 | 1.80 | 0.75 | 1.05 | 1.16 | 0.40 | 0.46 | 1.02 | 1.11 | 0.88 | 1.04 | 1.97 | 1.65 | 1.83 |
H | 1.47 | 0.26 | 0.69 | 1.46 | 1.03 | 0.46 | 0.96 | 0.96 | 1.13 | 1.45 | 1.27 | 0.82 | 0.34 | 1.32 | 1.10 | 0.91 | 0.89 | 0.78 | 0.79 | 0.71 |