Задание 2¶

In [1]:
import requests
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
/home/bush29/anaconda3/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.2
  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
In [2]:
np.random.seed(100500)
human_genes = pd.read_csv('human-genes.tsv',sep='\t')
genes = human_genes.sample(300, replace=False)
In [3]:
lst_seq = []
server = 'https://rest.ensembl.org'
counter = 0
for row in tqdm(genes.itertuples()):
    strand = 1 if row.strand == '+' else -1
    coord = row.thickStart if strand == 1 else row.thickEnd
    
    if strand == 1:
        ext = f"/sequence/region/human/{row._1}:{coord-6}..{coord+6}:{strand}?expand_3prime=0;expand_5prime=0" 
    else:
        ext = f"/sequence/region/human/{row._1}:{coord-5}..{coord+7}:{strand}?expand_3prime=0;expand_5prime=0"
    
    r = requests.get(server+ext, headers={ "Content-Type" : "text/x-fasta"})
    seq = r.text.split('\n')[1]
    
    cond = ('ATG' == seq[7:10]) 
    lst_seq.append(seq) if cond else None
    counter += 1  if cond else 0
    
    if counter == 100:
        break
0it [00:00, ?it/s]
In [4]:
np.random.seed(100500)

np.random.shuffle(lst_seq)
x_train = lst_seq[:40]
x_test = lst_seq[40:]
len(x_test)
Out[4]:
60
In [5]:
gc_content = 0.41 #according to wikipedia XD
expect_dict = {'A':(1-gc_content)/2, 'T':(1-gc_content)/2, 'G':gc_content/2, 'C':gc_content/2 }
assert sum(expect_dict.values())
In [6]:
my_matrix = pd.DataFrame([list(i) for i in x_train])
lst_series = []

for col_name in my_matrix.columns:
    col = my_matrix.loc[:,col_name]
    my_counts = pd.value_counts(col).sort_index()
    lst_series.append(my_counts)
my_count_matrix = pd.concat(lst_series, axis=1).fillna(0)    

my_prop_matrix = my_count_matrix.apply(lambda x: (x+0.1)/(sum(x.values)+0.4) , axis=0)
pwm = pd.concat([pd.Series(map(lambda x : np.log2((x)/(expect_dict[row.Index])), row[1:])) for row in my_prop_matrix.itertuples()], axis=1).T
pwm.index = my_prop_matrix.index
pwm
Out[6]:
0 1 2 3 4 5 6 7 8 9 10 11 12
A 0.136425 -0.557148 -0.557148 -0.238787 0.954751 -0.102582 -0.747251 1.750460 -6.896998 -6.896998 -0.238787 -0.238787 -1.539446
C -0.032057 0.422509 0.661516 1.045945 -2.912476 0.959010 1.205522 -6.371907 -6.371907 -6.371907 -1.979590 1.045945 0.959010
G -0.032057 0.959010 0.546956 0.422509 0.767644 -0.222160 0.286304 -6.371907 -6.371907 2.275551 1.479842 0.135887 0.767644
T -0.102582 -1.224573 -0.747251 -2.504681 -2.504681 -0.966261 -1.539446 -6.896998 1.750460 -6.896998 -1.224573 -1.539446 -0.966261
In [7]:
np.random.seed(100500)

atg_sars = pd.read_csv('./sarsatg.tsv',sep='\t')
drop_lst = atg_sars.loc[atg_sars['Pattern'] == 'start-tr', 'Start']
good_lst = atg_sars.loc[ (~atg_sars['Start'].isin(drop_lst)&(atg_sars['Strand']=='+')),:]
with open('./sars.fasta', 'r') as file1:
    genome = ''.join([i[:-1] for i in file1.readlines() if i[0]!='>'])

sample = good_lst.sample(60, replace=False)
x_neg = [genome[int(row.Start-8):int(row.End+3)] for row in sample.itertuples()]
In [8]:
#Train_weights
train_w = [sum([pwm.loc[value,ind] for ind,value in enumerate(seq)]) for seq in x_train]
train_w
#Test_weights
test_w = [sum([pwm.loc[value,ind] for ind,value in enumerate(seq)]) for seq in x_test]
#Neg_contr_weights
neg_w = [sum([pwm.loc[value,ind] for ind,value in enumerate(seq)]) for seq in x_neg]

ans_table = pd.concat((pd.Series(train_w),pd.Series(test_w), pd.Series(neg_w)), axis=1)
ans_table.columns = ['train', 'test', 'neg_contr']

Время графиков, написанных в 2 строки!!!(4)¶

In [9]:
sns.set_theme()
fig = plt.figure(figsize=(8,6))
axes = plt.subplot()
sns.histplot(data = ans_table)
Out[9]:
<AxesSubplot: ylabel='Count'>
In [10]:
sns.set_theme()
fig = plt.figure(figsize=(8,6))
axes = plt.subplot()
sns.boxplot(data = ans_table)
Out[10]:
<AxesSubplot: >

Задание 3¶

In [11]:
new_prop = my_count_matrix.apply(lambda x: (x)/(sum(x.values)) , axis=0)
new_pwm = pd.concat([pd.Series(map(lambda x : np.log2((x)/(expect_dict[row.Index])), row[1:])) for row in new_prop.itertuples()], axis=1).T
new_pwm[new_pwm == -(np.inf)] = 0
new_pwm.index = new_prop.index
IC_matrix = new_prop * new_pwm # наверное, я не уверен)
IC_matrix.loc['IC(j)',:] = IC_matrix.sum(axis=0)
IC_matrix
/tmp/ipykernel_6483/1174954439.py:2: RuntimeWarning: divide by zero encountered in log2
  new_pwm = pd.concat([pd.Series(map(lambda x : np.log2((x)/(expect_dict[row.Index])), row[1:])) for row in new_prop.itertuples()], axis=1).T
Out[11]:
0 1 2 3 4 5 6 7 8 9 10 11 12
A 0.045411 -0.112143 -0.112143 -0.059697 0.553637 -0.027853 -0.131838 1.761213 0.000000 0.000000 -0.059697 -0.059697 -0.156071
C -0.007125 0.116547 0.216065 0.447032 -0.075891 0.385750 0.575844 0.000000 0.000000 0.000000 -0.101781 0.447032 0.385750
G -0.007125 0.385750 0.164802 0.116547 0.270106 -0.039947 0.071576 0.000000 0.000000 2.286304 0.855564 0.030218 0.270106
T -0.027853 -0.154848 -0.131838 -0.128036 -0.128036 -0.146363 -0.156071 0.000000 1.761213 0.000000 -0.154848 -0.156071 -0.146363
IC(j) 0.003308 0.235306 0.136886 0.375846 0.619817 0.171588 0.359511 1.761213 1.761213 2.286304 0.539238 0.261481 0.353422

Задание 4¶

In [12]:
from collections import defaultdict, Counter
from statsmodels.stats.proportion import proportions_ztest
with open('./sequence.fasta', 'r') as file1:
    genome = ''.join([i[:-1] for i in file1.readlines() if i[0]!='>'])
coli_exp = {'A':(1-0.508)/2, 'T':(1-0.508)/2, 'G':(0.508)/2, 'C':(0.508)/2 }    
dct = defaultdict(int)
k_mers = [genome[i:i+6] for i in range(len(genome))]
counts = Counter(k_mers)
ans_count = counts['GAATTC']
ans_prop = ans_count/sum(counts.values())
expected_prop = np.prod(np.array([coli_exp[i] for i in 'GAATTC']))
expected_prop
Out[12]:
0.00023626960849209598
In [13]:
score, pval = proportions_ztest(count=ans_count, nobs=sum(counts.values()),value=expected_prop,  alternative='two-sided', prop_var = expected_prop)
pval # It's a literally zero
Out[13]:
4.5837535844254235e-42
In [14]:
!jupyter nbconvert --to html pr6.ipynb