import numpy as np
import pandas as pd
import os


# Names of the regular files inside the 'DSSP' directory.
# Sub-directories are left out because they cannot be opened as files, and the
# list is sorted so the processing order does not depend on the file system.
files = sorted(
    name for name in os.listdir('DSSP')
    if os.path.isfile(os.path.join('DSSP', name))
)


# Reading the DSSP files
def ss_simplify(code):
    """Reduce a secondary-structure code to one of three states.

    'E' and 'H' are returned unchanged; every other value is mapped to 'C'.
    """
    return code if code in ('E', 'H') else 'C'


def parseDSSP(fnl, dssp_dir='DSSP'):
    """Read DSSP files and collect their residues into a single table.

    Parameters:
        fnl: iterable of file names, each resolved relative to ``dssp_dir``.
        dssp_dir: directory that holds the files (default ``'DSSP'``).

    Returns:
        pandas.DataFrame with columns ``Resn`` (one-letter residue code),
        ``Resi`` (residue number as int) and ``SS`` (code returned by
        ``ss_simplify``), one row per kept residue, in reading order.
        An empty ``fnl`` yields an empty frame with the same columns.

    Raises:
        OSError: if a file cannot be opened.
        ValueError: if characters 5-9 of a residue line are not an integer.
        IndexError: if a non-blank residue line is shorter than 17 characters.
    """
    columns = ['Resn', 'Resi', 'SS']
    # Rows are gathered in a plain list and turned into a DataFrame once;
    # growing a DataFrame one row at a time is quadratic in the row count.
    rows = []
    for fname in fnl:
        with open(os.path.join(dssp_dir, fname), 'r') as fh:
            for line in fh:
                tk = line.split()
                # Skip blank lines, lines whose last token is '.' and lines
                # whose first token is '#': none of them describe a residue.
                if not tk or tk[-1] == '.' or tk[0] == '#':
                    continue
                resname = line[13]
                # 'X' and '!' entries are not counted as residues.
                if resname in ('X', '!'):
                    continue
                # DSSP writes SS-bridged cysteines as lowercase letters
                # (one letter per bridge, not only 'a' and 'b').
                if resname.islower():
                    resname = 'C'
                rows.append([resname, int(line[5:10]), ss_simplify(line[16])])
    return pd.DataFrame(rows, columns=columns)


# Residue table built from every listed DSSP file
all = parseDSSP(files)


# Number of residues in each secondary-structure type
summ_SS = all.groupby(['SS']).count()


# Number of residues of each type in the dataset
summ_RES = all.groupby(['Resn']).count()


# Number of residues of each type found in each secondary-structure type
summ_RES_SS = all.groupby(['SS', 'Resn']).count()


# Propensity for every (SS, Resn) cell:
#   count(SS, Resn) * total / count(Resn) / count(SS), rounded to 2 decimals
s_fin = summ_RES_SS.pivot_table(values='Resi', index='SS', columns='Resn')
# Row and column labels of the table, kept as module-level names
index = list(s_fin.index)
names = list(s_fin.columns)
s_fin = (
    (s_fin * len(all))
    .div(summ_RES['Resi'], axis='columns')  # per-residue totals align on columns
    .div(summ_SS['Resn'], axis='index')     # per-structure totals align on rows
    .round(2)
)


# Final table
s_fin

Resn	A	C	D	E	F	G	H	I	K	L	M	N	P	Q	R	S	T	V	W	Y
SS
C	0.77	1.28	1.51	0.85	0.78	1.56	1.17	0.64	1.03	0.67	0.74	1.41	1.71	0.77	0.88	1.12	1.06	0.69	0.83	0.80
E	0.81	1.47	0.38	0.68	1.41	0.59	0.69	1.80	0.75	1.05	1.16	0.40	0.46	1.02	1.11	0.88	1.04	1.97	1.65	1.83
H	1.47	0.26	0.69	1.46	1.03	0.46	0.96	0.96	1.13	1.45	1.27	0.82	0.34	1.32	1.10	0.91	0.89	0.78	0.79	0.71

Практикум 8. Задание 2