#импортируем необходимые модули
import pandas as pd
import requests, sys

import random

from numpy import transpose as tp
from numpy import log, log2

from Bio import SeqIO

import matplotlib.pyplot as plt
import seaborn as sns


#для дальнейшей работы зададим некоторые параметры
seed = 76459 
gc = 0.404 #GC-состав человека (по всем хромосомам)
psevd = 0.1 #величина псевдокаунта
proba = {'A': (1-gc)/2, 'G': gc/2, 'T': (1-gc)/2, 'C': gc/2} #ожидаемые частоты

PATH_SRC = './src/'


data = pd.read_csv(PATH_SRC+'data.tsv', sep = '\t') #файл с разметкой


#наберём 100 последовательностей с 6 нуклеотидами до ATG и 3 - после
random.seed(seed)
records = []
counter = 0
while True:
    
    i = random.choice(data.index)
    
    chrom = data.loc[i, 'chrom']
    ATG = data.loc[i, 'thickStart']
    ATG_revers = data.loc[i, 'thickEnd']
    strand = data.loc[i, 'strand']
    
    server = 'https://rest.ensembl.org'
    if strand == '+':
        ext = f'/sequence/region/human/{chrom}:{ATG-6}..{ATG+6}:1?expand_3prime=0;expand_5prime=0'
    elif strand == '-':
        ext = f'/sequence/region/human/{chrom}:{ATG_revers-5}..{ATG_revers+7}:-1?expand_3prime=0;expand_5prime=0'
    
    r = requests.get(server+ext, headers={ "Content-Type" : "text/x-fasta"})
    
    if not r.ok:
        r.raise_for_status()
        sys.exit()
        
    seq = r.text.split('\n')[1]
    
    if seq[7:10] == 'ATG':
        counter += 1
        records.append(seq)
    
    if counter % 10 == 0:
        print(f'Collected {counter} seqs')
        
    if counter == 100:
        break

Collected 10 seqs
Collected 20 seqs
Collected 20 seqs
Collected 30 seqs
Collected 30 seqs
Collected 40 seqs
Collected 40 seqs
Collected 50 seqs
Collected 60 seqs
Collected 70 seqs
Collected 80 seqs
Collected 90 seqs
Collected 100 seqs


#полученные последовательности разобьём на тренировочные (обучающие; 40) и тестовые (60)
train, test = records[60:], records[:60]
        
N = len(train)

len(records), len(train), len(test)

(100, 40, 60)


#по обучающим последовательностям строим PWM
pwm_dict = {'A': [], 'G': [], 'T': [], 'C': []}

transposed = tp(list(map(lambda x: [i for i in x], train)))
df = pd.DataFrame(transposed)

for i in df.index:
    values = df.iloc[i, :].value_counts()
    
    for k, v in proba.items():
        try:
            pwm_dict[k].append(round(log((values.loc[k] + psevd)/(N + 4*psevd)/v), 3))
        except KeyError:
            pwm_dict[k].append(round(log((0 + psevd)/(N + 4*psevd)/v), 2))
            
pwm = tp(pd.DataFrame(pwm_dict))
pwm


#для негативного контроля брались последовательности вокруг ATG из генома Sars-CoV-2
neg_df = pd.read_csv(PATH_SRC+'sarsatg.tsv', sep = '\t')

random.seed(seed)

clean_negative = neg_df[(neg_df.Strand == '+') & (neg_df.Pattern != 'start-tr')].reset_index(drop = True)
negative_random = random.choices(clean_negative.index, k = 60)

coords_atg = clean_negative.iloc[negative_random, [0, 1]].reset_index(drop = True)
coords_atg.Start = coords_atg.Start - 8
coords_atg.End = coords_atg.End + 3

iterator = SeqIO.parse(PATH_SRC+'sars.fasta', 'fasta')
genome = next(iterator)

negative_seqs = []
for i in coords_atg.index:
    negative_seqs.append(genome.seq[int(coords_atg.iloc[i, 0]):int(coords_atg.iloc[i, 1])])
    
negative = []
for i in negative_seqs:
    negative.append(sum([pwm.loc[j].iloc[k] for k, j in enumerate(i)]))


#веса обучающих последовательностей
train_w = [sum([pwm.loc[value,ind] for ind,value in enumerate(seq)]) for seq in train]
train_w
#веса тестовых последовательностей
test_w = [sum([pwm.loc[value,ind] for ind,value in enumerate(seq)]) for seq in test]
#веса последовательностей негативного контроля
neg_w = [sum([pwm.loc[value,ind] for ind,value in enumerate(seq)]) for seq in negative_seqs]

ans_table = pd.concat((pd.Series(train_w),pd.Series(test_w), pd.Series(neg_w)), axis=1)
ans_table.columns = ['train', 'test', 'neg_contr']


sns.set_theme()
fig = plt.figure()
axes = plt.subplot()
sns.histplot(data = ans_table);


#по обучающим последовательностям также была построена матрица информационного содержания
ic_dict = {'A': [], 'G': [], 'T': [], 'C': []}

transposed = tp(list(map(lambda x: [i for i in x.__str__()], train)))
df = pd.DataFrame(transposed)

for i in df.index:
    values = df.iloc[i, :].value_counts()
    
    for k, v in proba.items():
        
        try:
            ic_dict[k].append(round((values.loc[k]/N)*log2(values.loc[k]/(N*v)), 3))
        except KeyError:
            ic_dict[k].append(0)
            
ic = tp(pd.DataFrame(ic_dict))
ic

	0	1	2	3	4	5	6	7	8	9	10	11	12
A	-0.528	-0.528	-0.396	-0.280	0.005	-0.176	-0.680	1.203	-4.790	-4.790	-0.081	0.291	-1.357
G	-0.007	0.473	0.394	0.308	0.740	0.213	0.615	-4.400	-4.400	1.592	0.740	0.109	0.850
T	-0.280	-0.859	-1.357	-0.396	-0.680	-0.176	-1.077	-4.790	1.203	-4.790	-1.357	-0.528	-0.396
C	0.679	0.615	0.740	0.394	-0.470	0.213	0.615	-4.400	-4.400	-4.400	0.109	-0.007	0.213