!pip install -q biopython


import numpy as np
import pandas as pd
import sympy as sy
import seaborn as sns
import matplotlib.pyplot as plt
import requests, sys
import random
from tqdm.auto import tqdm
from tqdm import tqdm_notebook
from sklearn.model_selection import train_test_split
import re

from Bio import Entrez
from Bio import SeqIO

import os


!wget 'https://kodomo.fbb.msu.ru/FBB/year_20/term4/human-genes.tsv'


genes = pd.read_csv('human-genes.tsv', delimiter='\t')
genes = pd.DataFrame(genes.loc[genes['strand'] == '+'])
genes['chrom'] = genes['#chrom'].apply(lambda x: x[-1])


genes.head()


# Максимально быстрый и максимально нечитаемый генератор ссылок
links = (pd.Series(
    ['https://rest.ensembl.org/sequence/region/human/'] *
    len(genes)) + genes.chrom + pd.Series(
        [':'] * len(genes)) + genes.thickStart.astype('int').apply(
            lambda x: x - 6).astype('string') + pd.Series(
                ['..'] * len(genes)) + genes.thickStart.astype('int').apply(
                    lambda x: x + 6).astype('string') + pd.Series(
                        [':1?expand_3prime=0;expand_5prime=0'] * len(genes))).dropna()


tqdm_notebook(tqdm.pandas())  # Без этого костыля прогресс-бар не работает
sample_links = links.sample(frac=0.25)  # Иначе слишком долго грузится
reqs = sample_links.progress_apply(
    lambda link: requests.get(link, headers={'Content-Type': 'text/x-fasta'}))

/var/folders/mc/gtx8k44d7dg3kj5dr714tqgr0000gn/T/ipykernel_30380/1785198480.py:1: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm_notebook(tqdm.pandas())  # Без этого костыля прогресс-бар не работает

0it [00:00, ?it/s]

  0%|          | 0/2162 [00:00<?, ?it/s]


reqs = reqs.loc[reqs.apply(lambda req: req.text[-7:-4] == 'ATG')]
seqs = reqs.apply(lambda x: x.text[-14:-1])


seqs = seqs.to_list()


neg_links = (pd.Series(
    ['https://rest.ensembl.org/sequence/region/human/'] *
    len(genes)) + genes.chrom + pd.Series(
        [':'] * len(genes)) + genes.thickStart.astype('string') + pd.Series(
                ['..'] * len(genes)) + genes.thickEnd.astype('string') + pd.Series(
                        [':1?expand_3prime=0;expand_5prime=0'] * len(genes))).dropna()


tqdm_notebook(tqdm.pandas())
sample_negs = neg_links.sample(frac=0.25)
neg_reqs = sample_negs.progress_apply(
    lambda link: requests.get(link, headers={'Content-Type': 'text/x-fasta'}))

/var/folders/mc/gtx8k44d7dg3kj5dr714tqgr0000gn/T/ipykernel_30380/3787484836.py:1: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tqdm_notebook(tqdm.pandas())

0it [00:00, ?it/s]

  0%|          | 0/2162 [00:00<?, ?it/s]


all_negs = []
for _, req in tqdm(neg_reqs.items(), total=len(neg_reqs)):
    seq = ''.join(req.text.split()[1:])
    subseqs = [x.group(0) for x in re.finditer(r'\w{7}ATG\w{3}', seq)]
    all_negs += subseqs

  0%|          | 0/2162 [00:00<?, ?it/s]


class PWM:
    def __init__(self, alignment, gc_cont, pseudocount=0.01):

        cont_dee = {'A': (1 - gc_cont)/2,
                    'T': (1 - gc_cont)/2,
                    'G': gc_cont/2,
                    'C': gc_cont/2}

        matrix = []

        for i in range(len(alignment[0])):
            freqs = {'A': 0, 'T': 0, 'G': 0, 'C': 0}

            for seq in alignment:
                freqs[seq[i]] += 1

            for key in freqs:
                freqs[key] = np.log((freqs[key] + pseudocount)/cont_dee[key]/len(alignment))

            matrix.append(list(freqs.values()))

        matrix = np.array(matrix).T

        df = pd.DataFrame({key: matrix[chi] for chi, key in enumerate(['A', 'T', 'G', 'C'])}).T
        df.columns = np.arange(1, len(alignment[0]) + 1)

        self.df = df

        self.min_result = sum(min(col) for col in matrix)
        self.max_result = sum(max(col) for col in matrix)

    def weights(self):
        return self.df

    def predict(self, seq):
        outp = 0
        for j, liter in enumerate(seq):
            outp += self.df.iloc[:, j][liter]
            proba = (outp - self.min_result)/(self.max_result - self.min_result)
        return proba


train_starts, test_starts = train_test_split(seqs, test_size=0.4)


pwm = PWM(train_starts, 0.444)
pwm.weights()


sns.heatmap(pwm.weights(), cmap='mako')
plt.plot()

[]


positive_scores = [pwm.predict(pos_seq) for pos_seq in tqdm(test_starts)]
negative_scores = [pwm.predict(neg_seq) for neg_seq in tqdm(all_negs)]

np.mean(positive_scores), np.mean(negative_scores)

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/1059864 [00:00<?, ?it/s]

(0.9555819272337526, 0.5591216367908287)

	#chrom	chromStart	chromEnd	name	strand	len	thickStart	thickEnd	reserved	len-thick	...	type	geneName	geneName2	transcriptClass	source	transcriptType	tag	level	tier	chrom
0	chr1	65418	71585	ENST00000641515.2	+	6168	65564	70008	789624	4445	...	none	OR4F5	A0A2U3U0J3	coding	havana_homo_sapiens	protein_coding	Ensembl_canonical,MANE_Select,RNA_Seq_supporte...	2	canonical,basic,all	1
3	chr1	923922	944574	ENST00000616016.5	+	20653	924431	944153	789624	19723	...	none	SAMD11	I7G285	coding	havana_homo_sapiens	protein_coding	CAGE_supported_TSS,Ensembl_canonical,MANE_Sele...	2	basic,all	1
4	chr1	925149	935793	ENST00000437963.5	+	10645	925941	935793	789624	9853	...	none	SAMD11	Q5SV95	coding	havana_homo_sapiens	protein_coding	alternative_5_UTR,cds_end_NF,mRNA_end_NF	2	all	1
5	chr1	930311	944575	ENST00000341065.8	+	14265	930311	944153	789624	13843	...	none	SAMD11	H7BY14	coding	havana_homo_sapiens	protein_coding	cds_start_NF,mRNA_start_NF	2	all	1
6	chr1	939274	944259	ENST00000455979.1	+	4986	939274	944153	789624	4880	...	none	SAMD11	H7C3J6	coding	havana_homo_sapiens	protein_coding	cds_start_NF,mRNA_start_NF	2	all	1

	1	2	3	4	5	6	7	8	9	10	11	12	13
A	-0.410149	-0.173807	-0.246553	0.017217	0.446499	-0.105996	-0.265598	1.280175	-8.822204	-8.822204	-0.388175	0.059771	-0.388175
T	-0.073741	-0.527905	-0.432617	-0.690379	-1.272069	-0.410149	-0.997758	-8.822204	1.280175	-8.822204	-0.227865	-0.366674	-0.089738
G	0.051136	0.390061	0.197716	-0.021610	0.377484	0.135205	0.085617	-8.597261	-8.597261	1.505119	0.752929	-0.141730	0.414751
C	0.377484	0.212752	0.402482	0.485360	-0.302961	0.338775	0.690133	-8.597261	-8.597261	-8.597261	-0.624450	0.364747	-0.002921

Описание сигнала¶

Построение PWM для последовательности Kozak Homo sapiens¶

Позитивная выборка¶

Негативная выборка¶

Теперь создадим класс с PWM¶

Обучение и тестирование¶