#импортируем необходимые модули
import pandas as pd
import requests, sys
import random
from numpy import transpose as tp
from numpy import log, log2
from Bio import SeqIO
import matplotlib.pyplot as plt
import seaborn as sns
#для дальнейшей работы зададим некоторые параметры
seed = 76459
gc = 0.404 #GC-состав человека (по всем хромосомам)
psevd = 0.1 #величина псевдокаунта
proba = {'A': (1-gc)/2, 'G': gc/2, 'T': (1-gc)/2, 'C': gc/2} #ожидаемые частоты
PATH_SRC = './src/'
data = pd.read_csv(PATH_SRC+'data.tsv', sep = '\t') #файл с разметкой
#наберём 100 последовательностей с 6 нуклеотидами до ATG и 3 - после
random.seed(seed)
records = []
counter = 0
while True:
i = random.choice(data.index)
chrom = data.loc[i, 'chrom']
ATG = data.loc[i, 'thickStart']
ATG_revers = data.loc[i, 'thickEnd']
strand = data.loc[i, 'strand']
server = 'https://rest.ensembl.org'
if strand == '+':
ext = f'/sequence/region/human/{chrom}:{ATG-6}..{ATG+6}:1?expand_3prime=0;expand_5prime=0'
elif strand == '-':
ext = f'/sequence/region/human/{chrom}:{ATG_revers-5}..{ATG_revers+7}:-1?expand_3prime=0;expand_5prime=0'
r = requests.get(server+ext, headers={ "Content-Type" : "text/x-fasta"})
if not r.ok:
r.raise_for_status()
sys.exit()
seq = r.text.split('\n')[1]
if seq[7:10] == 'ATG':
counter += 1
records.append(seq)
if counter % 10 == 0:
print(f'Collected {counter} seqs')
if counter == 100:
break
Collected 10 seqs Collected 20 seqs Collected 20 seqs Collected 30 seqs Collected 30 seqs Collected 40 seqs Collected 40 seqs Collected 50 seqs Collected 60 seqs Collected 70 seqs Collected 80 seqs Collected 90 seqs Collected 100 seqs
#полученные последовательности разобьём на тренировочные (обучающие; 40) и тестовые (60)
train, test = records[60:], records[:60]
N = len(train)
len(records), len(train), len(test)
(100, 40, 60)
#по обучающим последовательностям строим PWM
pwm_dict = {'A': [], 'G': [], 'T': [], 'C': []}
transposed = tp(list(map(lambda x: [i for i in x], train)))
df = pd.DataFrame(transposed)
for i in df.index:
values = df.iloc[i, :].value_counts()
for k, v in proba.items():
try:
pwm_dict[k].append(round(log((values.loc[k] + psevd)/(N + 4*psevd)/v), 3))
except KeyError:
pwm_dict[k].append(round(log((0 + psevd)/(N + 4*psevd)/v), 2))
pwm = tp(pd.DataFrame(pwm_dict))
pwm
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
A | -0.528 | -0.528 | -0.396 | -0.280 | 0.005 | -0.176 | -0.680 | 1.203 | -4.790 | -4.790 | -0.081 | 0.291 | -1.357 |
G | -0.007 | 0.473 | 0.394 | 0.308 | 0.740 | 0.213 | 0.615 | -4.400 | -4.400 | 1.592 | 0.740 | 0.109 | 0.850 |
T | -0.280 | -0.859 | -1.357 | -0.396 | -0.680 | -0.176 | -1.077 | -4.790 | 1.203 | -4.790 | -1.357 | -0.528 | -0.396 |
C | 0.679 | 0.615 | 0.740 | 0.394 | -0.470 | 0.213 | 0.615 | -4.400 | -4.400 | -4.400 | 0.109 | -0.007 | 0.213 |
#для негативного контроля брались последовательности вокруг ATG из генома Sars-CoV-2
neg_df = pd.read_csv(PATH_SRC+'sarsatg.tsv', sep = '\t')
random.seed(seed)
clean_negative = neg_df[(neg_df.Strand == '+') & (neg_df.Pattern != 'start-tr')].reset_index(drop = True)
negative_random = random.choices(clean_negative.index, k = 60)
coords_atg = clean_negative.iloc[negative_random, [0, 1]].reset_index(drop = True)
coords_atg.Start = coords_atg.Start - 8
coords_atg.End = coords_atg.End + 3
iterator = SeqIO.parse(PATH_SRC+'sars.fasta', 'fasta')
genome = next(iterator)
negative_seqs = []
for i in coords_atg.index:
negative_seqs.append(genome.seq[int(coords_atg.iloc[i, 0]):int(coords_atg.iloc[i, 1])])
negative = []
for i in negative_seqs:
negative.append(sum([pwm.loc[j].iloc[k] for k, j in enumerate(i)]))
#веса обучающих последовательностей
train_w = [sum([pwm.loc[value,ind] for ind,value in enumerate(seq)]) for seq in train]
train_w
#веса тестовых последовательностей
test_w = [sum([pwm.loc[value,ind] for ind,value in enumerate(seq)]) for seq in test]
#веса последовательностей негативного контроля
neg_w = [sum([pwm.loc[value,ind] for ind,value in enumerate(seq)]) for seq in negative_seqs]
ans_table = pd.concat((pd.Series(train_w),pd.Series(test_w), pd.Series(neg_w)), axis=1)
ans_table.columns = ['train', 'test', 'neg_contr']
sns.set_theme()
fig = plt.figure()
axes = plt.subplot()
sns.histplot(data = ans_table);
#по обучающим последовательностям также была построена матрица информационного содержания
ic_dict = {'A': [], 'G': [], 'T': [], 'C': []}
transposed = tp(list(map(lambda x: [i for i in x.__str__()], train)))
df = pd.DataFrame(transposed)
for i in df.index:
values = df.iloc[i, :].value_counts()
for k, v in proba.items():
try:
ic_dict[k].append(round((values.loc[k]/N)*log2(values.loc[k]/(N*v)), 3))
except KeyError:
ic_dict[k].append(0)
ic = tp(pd.DataFrame(ic_dict))
ic