import requests
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

/home/bush29/anaconda3/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.2
  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"


np.random.seed(100500)
human_genes = pd.read_csv('human-genes.tsv',sep='\t')
genes = human_genes.sample(300, replace=False)


lst_seq = []
server = 'https://rest.ensembl.org'
counter = 0
for row in tqdm(genes.itertuples()):
    strand = 1 if row.strand == '+' else -1
    coord = row.thickStart if strand == 1 else row.thickEnd
    
    if strand == 1:
        ext = f"/sequence/region/human/{row._1}:{coord-6}..{coord+6}:{strand}?expand_3prime=0;expand_5prime=0" 
    else:
        ext = f"/sequence/region/human/{row._1}:{coord-5}..{coord+7}:{strand}?expand_3prime=0;expand_5prime=0"
    
    r = requests.get(server+ext, headers={ "Content-Type" : "text/x-fasta"})
    seq = r.text.split('\n')[1]
    
    cond = ('ATG' == seq[7:10]) 
    lst_seq.append(seq) if cond else None
    counter += 1  if cond else 0
    
    if counter == 100:
        break

0it [00:00, ?it/s]


np.random.seed(100500)

np.random.shuffle(lst_seq)
x_train = lst_seq[:40]
x_test = lst_seq[40:]
len(x_test)

60


gc_content = 0.41 #according to wikipedia XD
expect_dict = {'A':(1-gc_content)/2, 'T':(1-gc_content)/2, 'G':gc_content/2, 'C':gc_content/2 }
assert sum(expect_dict.values())


my_matrix = pd.DataFrame([list(i) for i in x_train])
lst_series = []

for col_name in my_matrix.columns:
    col = my_matrix.loc[:,col_name]
    my_counts = pd.value_counts(col).sort_index()
    lst_series.append(my_counts)
my_count_matrix = pd.concat(lst_series, axis=1).fillna(0)    

my_prop_matrix = my_count_matrix.apply(lambda x: (x+0.1)/(sum(x.values)+0.4) , axis=0)
pwm = pd.concat([pd.Series(map(lambda x : np.log2((x)/(expect_dict[row.Index])), row[1:])) for row in my_prop_matrix.itertuples()], axis=1).T
pwm.index = my_prop_matrix.index
pwm


np.random.seed(100500)

atg_sars = pd.read_csv('./sarsatg.tsv',sep='\t')
drop_lst = atg_sars.loc[atg_sars['Pattern'] == 'start-tr', 'Start']
good_lst = atg_sars.loc[ (~atg_sars['Start'].isin(drop_lst)&(atg_sars['Strand']=='+')),:]
with open('./sars.fasta', 'r') as file1:
    genome = ''.join([i[:-1] for i in file1.readlines() if i[0]!='>'])

sample = good_lst.sample(60, replace=False)
x_neg = [genome[int(row.Start-8):int(row.End+3)] for row in sample.itertuples()]


#Train_weights
train_w = [sum([pwm.loc[value,ind] for ind,value in enumerate(seq)]) for seq in x_train]
train_w
#Test_weights
test_w = [sum([pwm.loc[value,ind] for ind,value in enumerate(seq)]) for seq in x_test]
#Neg_contr_weights
neg_w = [sum([pwm.loc[value,ind] for ind,value in enumerate(seq)]) for seq in x_neg]

ans_table = pd.concat((pd.Series(train_w),pd.Series(test_w), pd.Series(neg_w)), axis=1)
ans_table.columns = ['train', 'test', 'neg_contr']


sns.set_theme()
fig = plt.figure(figsize=(8,6))
axes = plt.subplot()
sns.histplot(data = ans_table)

<AxesSubplot: ylabel='Count'>


sns.set_theme()
fig = plt.figure(figsize=(8,6))
axes = plt.subplot()
sns.boxplot(data = ans_table)

<AxesSubplot: >


new_prop = my_count_matrix.apply(lambda x: (x)/(sum(x.values)) , axis=0)
new_pwm = pd.concat([pd.Series(map(lambda x : np.log2((x)/(expect_dict[row.Index])), row[1:])) for row in new_prop.itertuples()], axis=1).T
new_pwm[new_pwm == -(np.inf)] = 0
new_pwm.index = new_prop.index
IC_matrix = new_prop * new_pwm # наверное, я не уверен)
IC_matrix.loc['IC(j)',:] = IC_matrix.sum(axis=0)
IC_matrix

/tmp/ipykernel_6483/1174954439.py:2: RuntimeWarning: divide by zero encountered in log2
  new_pwm = pd.concat([pd.Series(map(lambda x : np.log2((x)/(expect_dict[row.Index])), row[1:])) for row in new_prop.itertuples()], axis=1).T


from collections import defaultdict, Counter
from statsmodels.stats.proportion import proportions_ztest
with open('./sequence.fasta', 'r') as file1:
    genome = ''.join([i[:-1] for i in file1.readlines() if i[0]!='>'])
coli_exp = {'A':(1-0.508)/2, 'T':(1-0.508)/2, 'G':(0.508)/2, 'C':(0.508)/2 }    
dct = defaultdict(int)
k_mers = [genome[i:i+6] for i in range(len(genome))]
counts = Counter(k_mers)
ans_count = counts['GAATTC']
ans_prop = ans_count/sum(counts.values())
expected_prop = np.prod(np.array([coli_exp[i] for i in 'GAATTC']))
expected_prop

0.00023626960849209598


score, pval = proportions_ztest(count=ans_count, nobs=sum(counts.values()),value=expected_prop,  alternative='two-sided', prop_var = expected_prop)
pval # It's a literally zero

4.5837535844254235e-42


!jupyter nbconvert --to html pr6.ipynb

	0	1	2	3	4	5	6	7	8	9	10	11	12
A	0.136425	-0.557148	-0.557148	-0.238787	0.954751	-0.102582	-0.747251	1.750460	-6.896998	-6.896998	-0.238787	-0.238787	-1.539446
C	-0.032057	0.422509	0.661516	1.045945	-2.912476	0.959010	1.205522	-6.371907	-6.371907	-6.371907	-1.979590	1.045945	0.959010
G	-0.032057	0.959010	0.546956	0.422509	0.767644	-0.222160	0.286304	-6.371907	-6.371907	2.275551	1.479842	0.135887	0.767644
T	-0.102582	-1.224573	-0.747251	-2.504681	-2.504681	-0.966261	-1.539446	-6.896998	1.750460	-6.896998	-1.224573	-1.539446	-0.966261

	0	1	2	3	4	5	6	7	8	9	10	11	12
A	0.045411	-0.112143	-0.112143	-0.059697	0.553637	-0.027853	-0.131838	1.761213	0.000000	0.000000	-0.059697	-0.059697	-0.156071
C	-0.007125	0.116547	0.216065	0.447032	-0.075891	0.385750	0.575844	0.000000	0.000000	0.000000	-0.101781	0.447032	0.385750
G	-0.007125	0.385750	0.164802	0.116547	0.270106	-0.039947	0.071576	0.000000	0.000000	2.286304	0.855564	0.030218	0.270106
T	-0.027853	-0.154848	-0.131838	-0.128036	-0.128036	-0.146363	-0.156071	0.000000	1.761213	0.000000	-0.154848	-0.156071	-0.146363
IC(j)	0.003308	0.235306	0.136886	0.375846	0.619817	0.171588	0.359511	1.761213	1.761213	2.286304	0.539238	0.261481	0.353422

Задание 2¶

Время графиков, написанных в 2 строки!!!(4)¶

Задание 3¶

Задание 4¶