import requests
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
/home/bush29/anaconda3/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.2 warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
np.random.seed(100500)
human_genes = pd.read_csv('human-genes.tsv',sep='\t')
genes = human_genes.sample(300, replace=False)
lst_seq = []
server = 'https://rest.ensembl.org'
counter = 0
for row in tqdm(genes.itertuples()):
strand = 1 if row.strand == '+' else -1
coord = row.thickStart if strand == 1 else row.thickEnd
if strand == 1:
ext = f"/sequence/region/human/{row._1}:{coord-6}..{coord+6}:{strand}?expand_3prime=0;expand_5prime=0"
else:
ext = f"/sequence/region/human/{row._1}:{coord-5}..{coord+7}:{strand}?expand_3prime=0;expand_5prime=0"
r = requests.get(server+ext, headers={ "Content-Type" : "text/x-fasta"})
seq = r.text.split('\n')[1]
cond = ('ATG' == seq[7:10])
lst_seq.append(seq) if cond else None
counter += 1 if cond else 0
if counter == 100:
break
0it [00:00, ?it/s]
np.random.seed(100500)
np.random.shuffle(lst_seq)
x_train = lst_seq[:40]
x_test = lst_seq[40:]
len(x_test)
60
gc_content = 0.41 #according to wikipedia XD
expect_dict = {'A':(1-gc_content)/2, 'T':(1-gc_content)/2, 'G':gc_content/2, 'C':gc_content/2 }
assert sum(expect_dict.values())
my_matrix = pd.DataFrame([list(i) for i in x_train])
lst_series = []
for col_name in my_matrix.columns:
col = my_matrix.loc[:,col_name]
my_counts = pd.value_counts(col).sort_index()
lst_series.append(my_counts)
my_count_matrix = pd.concat(lst_series, axis=1).fillna(0)
my_prop_matrix = my_count_matrix.apply(lambda x: (x+0.1)/(sum(x.values)+0.4) , axis=0)
pwm = pd.concat([pd.Series(map(lambda x : np.log2((x)/(expect_dict[row.Index])), row[1:])) for row in my_prop_matrix.itertuples()], axis=1).T
pwm.index = my_prop_matrix.index
pwm
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
A | 0.136425 | -0.557148 | -0.557148 | -0.238787 | 0.954751 | -0.102582 | -0.747251 | 1.750460 | -6.896998 | -6.896998 | -0.238787 | -0.238787 | -1.539446 |
C | -0.032057 | 0.422509 | 0.661516 | 1.045945 | -2.912476 | 0.959010 | 1.205522 | -6.371907 | -6.371907 | -6.371907 | -1.979590 | 1.045945 | 0.959010 |
G | -0.032057 | 0.959010 | 0.546956 | 0.422509 | 0.767644 | -0.222160 | 0.286304 | -6.371907 | -6.371907 | 2.275551 | 1.479842 | 0.135887 | 0.767644 |
T | -0.102582 | -1.224573 | -0.747251 | -2.504681 | -2.504681 | -0.966261 | -1.539446 | -6.896998 | 1.750460 | -6.896998 | -1.224573 | -1.539446 | -0.966261 |
np.random.seed(100500)
atg_sars = pd.read_csv('./sarsatg.tsv',sep='\t')
drop_lst = atg_sars.loc[atg_sars['Pattern'] == 'start-tr', 'Start']
good_lst = atg_sars.loc[ (~atg_sars['Start'].isin(drop_lst)&(atg_sars['Strand']=='+')),:]
with open('./sars.fasta', 'r') as file1:
genome = ''.join([i[:-1] for i in file1.readlines() if i[0]!='>'])
sample = good_lst.sample(60, replace=False)
x_neg = [genome[int(row.Start-8):int(row.End+3)] for row in sample.itertuples()]
#Train_weights
train_w = [sum([pwm.loc[value,ind] for ind,value in enumerate(seq)]) for seq in x_train]
train_w
#Test_weights
test_w = [sum([pwm.loc[value,ind] for ind,value in enumerate(seq)]) for seq in x_test]
#Neg_contr_weights
neg_w = [sum([pwm.loc[value,ind] for ind,value in enumerate(seq)]) for seq in x_neg]
ans_table = pd.concat((pd.Series(train_w),pd.Series(test_w), pd.Series(neg_w)), axis=1)
ans_table.columns = ['train', 'test', 'neg_contr']
sns.set_theme()
fig = plt.figure(figsize=(8,6))
axes = plt.subplot()
sns.histplot(data = ans_table)
<AxesSubplot: ylabel='Count'>
sns.set_theme()
fig = plt.figure(figsize=(8,6))
axes = plt.subplot()
sns.boxplot(data = ans_table)
<AxesSubplot: >
new_prop = my_count_matrix.apply(lambda x: (x)/(sum(x.values)) , axis=0)
new_pwm = pd.concat([pd.Series(map(lambda x : np.log2((x)/(expect_dict[row.Index])), row[1:])) for row in new_prop.itertuples()], axis=1).T
new_pwm[new_pwm == -(np.inf)] = 0
new_pwm.index = new_prop.index
IC_matrix = new_prop * new_pwm # наверное, я не уверен)
IC_matrix.loc['IC(j)',:] = IC_matrix.sum(axis=0)
IC_matrix
/tmp/ipykernel_6483/1174954439.py:2: RuntimeWarning: divide by zero encountered in log2 new_pwm = pd.concat([pd.Series(map(lambda x : np.log2((x)/(expect_dict[row.Index])), row[1:])) for row in new_prop.itertuples()], axis=1).T
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
A | 0.045411 | -0.112143 | -0.112143 | -0.059697 | 0.553637 | -0.027853 | -0.131838 | 1.761213 | 0.000000 | 0.000000 | -0.059697 | -0.059697 | -0.156071 |
C | -0.007125 | 0.116547 | 0.216065 | 0.447032 | -0.075891 | 0.385750 | 0.575844 | 0.000000 | 0.000000 | 0.000000 | -0.101781 | 0.447032 | 0.385750 |
G | -0.007125 | 0.385750 | 0.164802 | 0.116547 | 0.270106 | -0.039947 | 0.071576 | 0.000000 | 0.000000 | 2.286304 | 0.855564 | 0.030218 | 0.270106 |
T | -0.027853 | -0.154848 | -0.131838 | -0.128036 | -0.128036 | -0.146363 | -0.156071 | 0.000000 | 1.761213 | 0.000000 | -0.154848 | -0.156071 | -0.146363 |
IC(j) | 0.003308 | 0.235306 | 0.136886 | 0.375846 | 0.619817 | 0.171588 | 0.359511 | 1.761213 | 1.761213 | 2.286304 | 0.539238 | 0.261481 | 0.353422 |
from collections import defaultdict, Counter
from statsmodels.stats.proportion import proportions_ztest
with open('./sequence.fasta', 'r') as file1:
genome = ''.join([i[:-1] for i in file1.readlines() if i[0]!='>'])
coli_exp = {'A':(1-0.508)/2, 'T':(1-0.508)/2, 'G':(0.508)/2, 'C':(0.508)/2 }
dct = defaultdict(int)
k_mers = [genome[i:i+6] for i in range(len(genome))]
counts = Counter(k_mers)
ans_count = counts['GAATTC']
ans_prop = ans_count/sum(counts.values())
expected_prop = np.prod(np.array([coli_exp[i] for i in 'GAATTC']))
expected_prop
0.00023626960849209598
score, pval = proportions_ztest(count=ans_count, nobs=sum(counts.values()),value=expected_prop, alternative='two-sided', prop_var = expected_prop)
pval # It's a literally zero
4.5837535844254235e-42
!jupyter nbconvert --to html pr6.ipynb