import pandas as pd
import numpy as np
import requests, sys
from collections import Counter
server = 'https://rest.ensembl.org'
import matplotlib.pyplot as plt
from numpy import log2, log
bases = ['A','T','G','C']
# https://www.ncbi.nlm.nih.gov/genome/51
GC = 0.396
fr_gc = GC/2
fr_at=(1-GC)/2
chrN = 'X'
strain = str(1)
expand_3prime = 0
expand_5prime = 0
df = pd.read_csv('human-genes.tsv', '\t')
df2 = df[(df['#chrom'] == 'chrX') & (df['strand'] == '+') & (df['len'] >3000)].iloc[0:100]
# В таблице координаты начала первого кодона ATG указаны как Thickstart если на прямой цепи
#chrom | chromStart | chromEnd | name | strand | len | thickStart | thickEnd | reserved | len-thick | ... | exonFrames | type | geneName | geneName2 | transcriptClass | source | transcriptType | tag | level | tier | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
42727 | chrX | 276321 | 303353 | ENST00000399012.6 | + | 27033 | 284187 | 299335 | 789624 | 15149 | ... | -1,-1,0,1,0,0,0,1, | none | PLCXD1 | Q9NUJ7 | coding | havana_homo_sapiens | protein_coding | CCDS,PAR,appris_principal_1,basic | 2 | canonical,basic,all,all |
42731 | chrX | 624343 | 646823 | ENST00000381578.6 | + | 22481 | 630897 | 644636 | 789624 | 13740 | ... | -1,0,1,0,1,0, | none | SHOX | O15266 | coding | havana_homo_sapiens | protein_coding | CCDS,PAR,appris_principal_1,basic | 2 | canonical,basic,all,all |
42733 | chrX | 1268799 | 1309935 | ENST00000417535.7 | + | 41137 | 1282703 | 1309479 | 789624 | 26777 | ... | -1,-1,0,1,0,1,2,1,0,0,1,1,2,0, | none | CSF2RA | P15509 | coding | ensembl_homo_sapiens | protein_coding | CCDS,PAR,appris_alternative_2,basic,ncRNA_host | 3 | basic,all,all |
42734 | chrX | 1336615 | 1382689 | ENST00000381469.7 | + | 46075 | 1341765 | 1382465 | 789624 | 40701 | ... | -1,0,1,2,1,0,0,1,2,0, | none | IL3RA | P26951 | coding | ensembl_homo_sapiens | protein_coding | CCDS,PAR,basic | 3 | basic,all,all |
42739 | chrX | 1591603 | 1602520 | ENST00000313871.9 | + | 10918 | 1593462 | 1601594 | 789624 | 8133 | ... | -1,0,0,2,0, | none | AKAP17A | Q02040 | coding | ensembl_havana_transcript_homo_sapiens | protein_coding | CCDS,Ensembl_canonical,MANE_Select,PAR,appris_... | 2 | canonical,basic,all,all |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
44236 | chrX | 155612587 | 155776795 | ENST00000675360.1 | + | 164209 | 155773871 | 155774738 | 789624 | 868 | ... | -1,-1,-1,0, | none | SPRY3 | O43610 | coding | havana_homo_sapiens | protein_coding | CCDS,Ensembl_canonical,RNA_Seq_supported_only,... | 2 | canonical,basic,all |
44237 | chrX | 155881344 | 155943769 | ENST00000286448.12 | + | 62426 | 155889466 | 155941951 | 789624 | 52486 | ... | -1,0,2,0,0,1,0,0, | none | VAMP7 | P51809 | coding | ensembl_havana_transcript_homo_sapiens | protein_coding | CCDS,Ensembl_canonical,MANE_Select,PAR,appris_... | 2 | canonical,basic,all,all |
44238 | chrX | 155997705 | 156010817 | ENST00000369423.7 | + | 13113 | 155997743 | 156009852 | 789624 | 12110 | ... | 0,2,1,1,2,0,0,1,2, | none | IL9R | Q01113 | coding | ensembl_havana_transcript_homo_sapiens | protein_coding | CCDS,PAR,basic,overlapping_locus | 2 | basic,all,all |
44239 | chrX | 155997695 | 156010817 | ENST00000244174.11 | + | 13123 | 155997759 | 156010409 | 789624 | 12651 | ... | 0,1,1,2,1,0,1,2,0, | none | IL9R | Q01113 | coding | ensembl_havana_transcript_homo_sapiens | protein_coding | CCDS,Ensembl_canonical,MANE_Select,PAR,appris_... | 2 | canonical,basic,all,all |
44240 | chrX | 156020960 | 156025374 | ENST00000359512.8 | + | 4415 | 156020960 | 156025374 | 789624 | 4415 | ... | 0,2,0,0,2,0,1,1,1,1, | none | WASH6P | A0A7I2YQU6 | coding | havana_homo_sapiens | protein_coding | Ensembl_canonical,PAR,appris_principal_1,basic... | 2 | canonical,basic,all,all |
593 rows × 24 columns
positive = df[(df['#chrom'] == 'chrX') & (df['strand'] == '+')].iloc[0:100]
negative_df = df[(df['#chrom'] == 'chrX') & (df['strand'] == '+') & (df['len'] >3000)]
with open ('train.txt', 'w') as train, open ('test.txt', 'w') as test:
c = 0
for coord in df2['coords']:
ext = f"/sequence/region/human/{chrN}:{coord}:{strain}?expand_3prime={expand_3prime};expand_5prime={expand_5prime}"
r = requests.get(server+ext, headers={ "Content-Type" : "text/x-fasta"})
if not r.ok:
r.raise_for_status()
sys.exit()
if c< 40:
c+=1
print(r.text.split()[1], file = train)
else:
print(r.text.split()[1], file = test)
with open ('negative.txt', 'w') as negative:
for coord in negative_df['coords']:
ext = f"/sequence/region/human/{chrN}:{coord}:{strain}?expand_3prime={expand_3prime};expand_5prime={expand_5prime}"
r = requests.get(server+ext, headers={ "Content-Type" : "text/x-fasta"})
if not r.ok:
r.raise_for_status()
sys.exit()
t = r.text.split()[1]
# print (t)
if 'ATG' in t:
print(r.text.split()[1], file = negative)
# with open ('train.txt', 'r') as f, open ('train.txt', 'w') as out:
# for line in f:
# line = line.strip()
# if line.startswith('>'):
# pass
# else:
# print (line.strip(), file = out)
https://rest.ensembl.org/sequence/region/human/{номер хромосомы}:{thickStart}..{thickEnd}:{ прямая или обратная цепь : 1, -1}?expand_3prime={кол-во нуклеотидов после стоп -кодона};expand_5prime={кол-во нуклеотидов перед ATG}
with open ('train.txt', 'r') as f:
list_of_lists = []
for line in f:
new_list = list(line.strip())
# new_list = [elem for elem in line.split().s]
list_of_lists.append(new_list)
MT = np.array(list_of_lists).transpose()
type(list(MT[1]))
N = len(MT[0])
def MT(file):
with open (file, 'r') as f:
MT = np.array([list(seq.strip()) for seq in f]).transpose()
N = len(list(MT[0]))
return N,MT
def COUNT_BASES(MT):
count_bases = {}
for c, lst in enumerate(MT):
res = {base: (list(lst)).count(base) for base in bases}
# list(lst), потому что lst - numpy array
count_bases[c] = res
return count_bases
def PWM(count_bases):
F_table = {}
e = 0.25
for i, d2 in count_bases.items():
res = {}
for base, count in d2.items():
if (base == 'G') or (base == 'C'):
fr = fr_gc
elif (base == 'A') or (base == 'T'):
fr = fr_at
pwm = round(log(((count+e)/(N+1))/fr),4)
res[base] = pwm
F_table[i+1] = res
return F_table
def ICbj (count_bases,N):
IC_table = {}
IC = 0
for i, d2 in count_bases.items():
res = {}
for base, count in d2.items():
if (base == 'G') or (base == 'C'):
fr = fr_gc
elif (base == 'A') or (base == 'T'):
fr = fr_at
# ic = round((count/N)*log2(((count/N)/fr)),3)
l = np.where(((count/N)/fr)<=0, 0, log2((count/N)/fr))
ic = round(((count/N)*l),3)
res[base] = ic
IC += ic
IC_table[i+1] = res
return IC_table,round(IC,2)
n, mt = MT('train.txt')
count_bases = COUNT_BASES(mt)
pwm = PWM(count_bases)
ic_tab, IC = ICbj(count_bases, n)
ic_tab, IC
({1: {'A': -0.096, 'T': -0.119, 'G': 0.232, 'C': 0.084}, 2: {'A': -0.096, 'T': -0.159, 'G': 0.232, 'C': 0.232}, 3: {'A': -0.119, 'T': -0.151, 'G': 0.232, 'C': 0.232}, 4: {'A': -0.096, 'T': -0.119, 'G': 0.003, 'C': 0.346}, 5: {'A': 0.259, 'T': -0.119, 'G': 0.084, 'C': -0.099}, 6: {'A': -0.003, 'T': -0.151, 'G': -0.031, 'C': 0.346}, 7: {'A': -0.151, 'T': -0.159, 'G': 0.232, 'C': 0.468}, 8: {'A': 1.343, 'T': -0.09, 'G': -0.105, 'C': -0.075}, 9: {'A': -0.09, 'T': 1.418, 'G': -0.075, 'C': -0.099}, 10: {'A': -0.09, 'T': -0.13, 'G': 1.966, 'C': -0.075}, 11: {'A': 0.034, 'T': -0.151, 'G': 0.533, 'C': -0.06}, 12: {'A': -0.096, 'T': -0.159, 'G': 0.18, 'C': 0.288}, 13: {'A': -0.068, 'T': -0.096, 'G': 0.13, 'C': 0.084}}, 5.83)
pd.DataFrame.from_dict(pwm)
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
A | -0.2916 | -0.2916 | -0.4060 | -0.2916 | 0.3879 | -0.0107 | -0.6837 | 1.0462 | -2.2931 | -2.2931 | 0.0678 | -0.2916 | -0.1890 |
T | -0.4060 | -0.8580 | -0.6837 | -0.4060 | -0.4060 | -0.6837 | -1.0693 | -2.2931 | 1.0742 | -1.7053 | -1.3376 | -0.8580 | -0.2916 |
G | 0.4899 | 0.4899 | 0.4899 | 0.0161 | 0.2332 | -0.1131 | 0.4899 | -0.9154 | -1.8709 | 1.4964 | 0.8101 | 0.4114 | 0.3263 |
C | 0.2332 | 0.4899 | 0.4899 | 0.6305 | -0.6472 | 0.6305 | 0.7537 | -1.8709 | -1.2832 | -1.8709 | -0.2615 | 0.5627 | 0.2332 |
ICDF = pd.DataFrame.from_dict(ic_tab)
ICDF.to_html()
'<table border="1" class="dataframe">\n <thead>\n <tr style="text-align: right;">\n <th></th>\n <th>1</th>\n <th>2</th>\n <th>3</th>\n <th>4</th>\n <th>5</th>\n <th>6</th>\n <th>7</th>\n <th>8</th>\n <th>9</th>\n <th>10</th>\n <th>11</th>\n <th>12</th>\n <th>13</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>A</th>\n <td>-0.096</td>\n <td>-0.096</td>\n <td>-0.119</td>\n <td>-0.096</td>\n <td>0.259</td>\n <td>-0.003</td>\n <td>-0.151</td>\n <td>1.343</td>\n <td>-0.090</td>\n <td>-0.090</td>\n <td>0.034</td>\n <td>-0.096</td>\n <td>-0.068</td>\n </tr>\n <tr>\n <th>T</th>\n <td>-0.119</td>\n <td>-0.159</td>\n <td>-0.151</td>\n <td>-0.119</td>\n <td>-0.119</td>\n <td>-0.151</td>\n <td>-0.159</td>\n <td>-0.090</td>\n <td>1.418</td>\n <td>-0.130</td>\n <td>-0.151</td>\n <td>-0.159</td>\n <td>-0.096</td>\n </tr>\n <tr>\n <th>G</th>\n <td>0.232</td>\n <td>0.232</td>\n <td>0.232</td>\n <td>0.003</td>\n <td>0.084</td>\n <td>-0.031</td>\n <td>0.232</td>\n <td>-0.105</td>\n <td>-0.075</td>\n <td>1.966</td>\n <td>0.533</td>\n <td>0.180</td>\n <td>0.130</td>\n </tr>\n <tr>\n <th>C</th>\n <td>0.084</td>\n <td>0.232</td>\n <td>0.232</td>\n <td>0.346</td>\n <td>-0.099</td>\n <td>0.346</td>\n <td>0.468</td>\n <td>-0.075</td>\n <td>-0.099</td>\n <td>-0.075</td>\n <td>-0.060</td>\n <td>0.288</td>\n <td>0.084</td>\n </tr>\n </tbody>\n</table>'
d = pd.DataFrame.from_dict(icbj)
d
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
A | -0.065 | -0.157 | -0.065 | -0.092 | 0.123 | -0.065 | -0.135 | 1.742 | 0.000 | 0.000 | 0.001 | -0.065 | -0.157 |
T | -0.065 | -0.092 | -0.149 | -0.150 | -0.158 | -0.116 | 0.000 | 0.000 | 1.742 | 0.000 | -0.116 | -0.157 | 0.039 |
G | 0.037 | 0.280 | 0.124 | 0.037 | 0.397 | -0.063 | 0.280 | 0.000 | 0.000 | 2.315 | 0.337 | 0.124 | 0.280 |
C | 0.124 | 0.173 | 0.225 | 0.589 | -0.086 | 0.397 | 0.589 | 0.000 | 0.000 | 0.000 | -0.086 | 0.280 | -0.001 |
Чем больше IC(j), тем больше частоты букв в
колонке отличаются от ожидаемых, тем больше
информации в колонке
F(b,j) = [N(b,j) + e] /(N + e) F = d[ /len(d)+e
w(G,15) = ln(0.38/0.35) = 0.1 w(G,15) = ln(мое число из таблички/fr_gc)
with open ('test.txt', 'r') as f, open ('res_test.txt', 'w') as res_t:
c = 0
rtest_ar = []
for line in f:
line = line.strip()
res = 0
c+=1
for pos,ch in enumerate(line):
res += w.at[ch, pos+1]
# print (w.at[ch, pos+1], sep = '', end = ' ')
# print ('\n', res)
print (round(res,2), file = res_t)
rtest_ar.append(round(res,2))
ar = np.asarray(rtest_ar)
ar
# для каждой позиции буквы в строчке - найти ее в столбце индекса таблицы и для ее номера найти число. просуммировать в рез
array([-7.4, 2.4, 4.8, 4.1, 5.4, 3.9, 4.1, 2.7, 3.1, 3.7, 2.7, 1.5, 5.2, 5.7, -4.2, 2.5, 4.9, 7.5, 6.7, 4.6, 4.3, 4.5, 3.7, 5.1, 6.4, 6.4, 6.2, 4.9, 3.1, 3.8, 3.5, 4.5, 5.3, 2.7, 5. , 3.4, 5.2, 0.3, -0.8, 5.5, 6.5, 5.4, -1.8, 2.7, 0.8, 2.9, 4.7, 3.2, -0.7, 5.8, 4.8, -1.6, 3.4, 5.2, 3.6, -2.1, 7.8, -4.7, 4.5, 2.9])
plt.boxplot(ar)
{'whiskers': [<matplotlib.lines.Line2D at 0x7f9a24f8e7b8>, <matplotlib.lines.Line2D at 0x7f9a24f8e160>], 'caps': [<matplotlib.lines.Line2D at 0x7f9a24f8c978>, <matplotlib.lines.Line2D at 0x7f9a24f8c4a8>], 'boxes': [<matplotlib.lines.Line2D at 0x7f9a24f8e860>], 'medians': [<matplotlib.lines.Line2D at 0x7f9a24f8bf98>], 'fliers': [<matplotlib.lines.Line2D at 0x7f9a24f8bcc0>], 'means': []}
NEGATIVE CONTROL
Вариант 2 положительный контроль множество тестовое генов человека отрицательный отберите сами ATG, которые не являются стартами трансляции, т.е. лежат вне кодиующей последовательности или наоборот лежт внутри гена.
Сравните распределения весов для положительного и отрицательного контроля и сделайте выводы.
df = pd.read_csv('human-genes.tsv', '\t')
df2 = df[(df['#chrom'] == 'chrX') & (df['strand'] == '+')].iloc[0:150]
df2 = df2[df2.len > 1000]
# В таблице координаты начала первого кодона ATG указаны как Thickstart если на прямой цепи
df2.shape
(144, 24)
# df3 = df2.drop([df2.len < 1000].index)
# # df = df.drop([df2.score < 50].index)
df2['coords'] = (df2['thickStart'] + 188).astype(str).str.cat((df2['thickStart']+200).astype(str), sep ="..")
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy """Entry point for launching an IPython kernel.
42727 284375..284387 42731 631085..631097 42733 1282891..1282903 42734 1341953..1341965 42739 1593650..1593662 ... 44236 155774059..155774071 44237 155889654..155889666 44238 155997931..155997943 44239 155997947..155997959 44240 156021148..156021160 Name: coords, Length: 673, dtype: object
df2
#chrom | chromStart | chromEnd | name | strand | len | thickStart | thickEnd | reserved | len-thick | ... | type | geneName | geneName2 | transcriptClass | source | transcriptType | tag | level | tier | coords | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
42727 | chrX | 276321 | 303353 | ENST00000399012.6 | + | 27033 | 284187 | 299335 | 789624 | 15149 | ... | none | PLCXD1 | Q9NUJ7 | coding | havana_homo_sapiens | protein_coding | CCDS,PAR,appris_principal_1,basic | 2 | canonical,basic,all,all | 284375..284387 |
42731 | chrX | 624343 | 646823 | ENST00000381578.6 | + | 22481 | 630897 | 644636 | 789624 | 13740 | ... | none | SHOX | O15266 | coding | havana_homo_sapiens | protein_coding | CCDS,PAR,appris_principal_1,basic | 2 | canonical,basic,all,all | 631085..631097 |
42733 | chrX | 1268799 | 1309935 | ENST00000417535.7 | + | 41137 | 1282703 | 1309479 | 789624 | 26777 | ... | none | CSF2RA | P15509 | coding | ensembl_homo_sapiens | protein_coding | CCDS,PAR,appris_alternative_2,basic,ncRNA_host | 3 | basic,all,all | 1282891..1282903 |
42734 | chrX | 1336615 | 1382689 | ENST00000381469.7 | + | 46075 | 1341765 | 1382465 | 789624 | 40701 | ... | none | IL3RA | P26951 | coding | ensembl_homo_sapiens | protein_coding | CCDS,PAR,basic | 3 | basic,all,all | 1341953..1341965 |
42739 | chrX | 1591603 | 1602520 | ENST00000313871.9 | + | 10918 | 1593462 | 1601594 | 789624 | 8133 | ... | none | AKAP17A | Q02040 | coding | ensembl_havana_transcript_homo_sapiens | protein_coding | CCDS,Ensembl_canonical,MANE_Select,PAR,appris_... | 2 | canonical,basic,all,all | 1593650..1593662 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
44236 | chrX | 155612587 | 155776795 | ENST00000675360.1 | + | 164209 | 155773871 | 155774738 | 789624 | 868 | ... | none | SPRY3 | O43610 | coding | havana_homo_sapiens | protein_coding | CCDS,Ensembl_canonical,RNA_Seq_supported_only,... | 2 | canonical,basic,all | 155774059..155774071 |
44237 | chrX | 155881344 | 155943769 | ENST00000286448.12 | + | 62426 | 155889466 | 155941951 | 789624 | 52486 | ... | none | VAMP7 | P51809 | coding | ensembl_havana_transcript_homo_sapiens | protein_coding | CCDS,Ensembl_canonical,MANE_Select,PAR,appris_... | 2 | canonical,basic,all,all | 155889654..155889666 |
44238 | chrX | 155997705 | 156010817 | ENST00000369423.7 | + | 13113 | 155997743 | 156009852 | 789624 | 12110 | ... | none | IL9R | Q01113 | coding | ensembl_havana_transcript_homo_sapiens | protein_coding | CCDS,PAR,basic,overlapping_locus | 2 | basic,all,all | 155997931..155997943 |
44239 | chrX | 155997695 | 156010817 | ENST00000244174.11 | + | 13123 | 155997759 | 156010409 | 789624 | 12651 | ... | none | IL9R | Q01113 | coding | ensembl_havana_transcript_homo_sapiens | protein_coding | CCDS,Ensembl_canonical,MANE_Select,PAR,appris_... | 2 | canonical,basic,all,all | 155997947..155997959 |
44240 | chrX | 156020960 | 156025374 | ENST00000359512.8 | + | 4415 | 156020960 | 156025374 | 789624 | 4415 | ... | none | WASH6P | A0A7I2YQU6 | coding | havana_homo_sapiens | protein_coding | Ensembl_canonical,PAR,appris_principal_1,basic... | 2 | canonical,basic,all,all | 156021148..156021160 |
673 rows × 25 columns
сhrN = 'X'
strand = str(1)
strand_df = '+'
expand_3prime = 0
expand_5prime = 0
with open ('negative.txt', 'w') as neg:
c = 0
for coord in df2['coords']:
ext = f"/sequence/region/human/{chrN}:{coord}:{strain}?expand_3prime={expand_3prime};expand_5prime={expand_5prime}"
r = requests.get(server+ext, headers={ "Content-Type" : "text/x-fasta"})
if not r.ok:
r.raise_for_status()
sys.exit()
t = r.text.split()[1]
# print (t)
if 'ATG' in t:
print(r.text.split()[1], file = neg)
df2
#chrom | chromStart | chromEnd | name | strand | len | thickStart | thickEnd | reserved | len-thick | ... | type | geneName | geneName2 | transcriptClass | source | transcriptType | tag | level | tier | coords | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
42727 | chrX | 276321 | 303353 | ENST00000399012.6 | + | 27033 | 284187 | 299335 | 789624 | 15149 | ... | none | PLCXD1 | Q9NUJ7 | coding | havana_homo_sapiens | protein_coding | CCDS,PAR,appris_principal_1,basic | 2 | canonical,basic,all,all | 284375..284387 |
42731 | chrX | 624343 | 646823 | ENST00000381578.6 | + | 22481 | 630897 | 644636 | 789624 | 13740 | ... | none | SHOX | O15266 | coding | havana_homo_sapiens | protein_coding | CCDS,PAR,appris_principal_1,basic | 2 | canonical,basic,all,all | 631085..631097 |
42733 | chrX | 1268799 | 1309935 | ENST00000417535.7 | + | 41137 | 1282703 | 1309479 | 789624 | 26777 | ... | none | CSF2RA | P15509 | coding | ensembl_homo_sapiens | protein_coding | CCDS,PAR,appris_alternative_2,basic,ncRNA_host | 3 | basic,all,all | 1282891..1282903 |
42734 | chrX | 1336615 | 1382689 | ENST00000381469.7 | + | 46075 | 1341765 | 1382465 | 789624 | 40701 | ... | none | IL3RA | P26951 | coding | ensembl_homo_sapiens | protein_coding | CCDS,PAR,basic | 3 | basic,all,all | 1341953..1341965 |
42739 | chrX | 1591603 | 1602520 | ENST00000313871.9 | + | 10918 | 1593462 | 1601594 | 789624 | 8133 | ... | none | AKAP17A | Q02040 | coding | ensembl_havana_transcript_homo_sapiens | protein_coding | CCDS,Ensembl_canonical,MANE_Select,PAR,appris_... | 2 | canonical,basic,all,all | 1593650..1593662 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
44236 | chrX | 155612587 | 155776795 | ENST00000675360.1 | + | 164209 | 155773871 | 155774738 | 789624 | 868 | ... | none | SPRY3 | O43610 | coding | havana_homo_sapiens | protein_coding | CCDS,Ensembl_canonical,RNA_Seq_supported_only,... | 2 | canonical,basic,all | 155774059..155774071 |
44237 | chrX | 155881344 | 155943769 | ENST00000286448.12 | + | 62426 | 155889466 | 155941951 | 789624 | 52486 | ... | none | VAMP7 | P51809 | coding | ensembl_havana_transcript_homo_sapiens | protein_coding | CCDS,Ensembl_canonical,MANE_Select,PAR,appris_... | 2 | canonical,basic,all,all | 155889654..155889666 |
44238 | chrX | 155997705 | 156010817 | ENST00000369423.7 | + | 13113 | 155997743 | 156009852 | 789624 | 12110 | ... | none | IL9R | Q01113 | coding | ensembl_havana_transcript_homo_sapiens | protein_coding | CCDS,PAR,basic,overlapping_locus | 2 | basic,all,all | 155997931..155997943 |
44239 | chrX | 155997695 | 156010817 | ENST00000244174.11 | + | 13123 | 155997759 | 156010409 | 789624 | 12651 | ... | none | IL9R | Q01113 | coding | ensembl_havana_transcript_homo_sapiens | protein_coding | CCDS,Ensembl_canonical,MANE_Select,PAR,appris_... | 2 | canonical,basic,all,all | 155997947..155997959 |
44240 | chrX | 156020960 | 156025374 | ENST00000359512.8 | + | 4415 | 156020960 | 156025374 | 789624 | 4415 | ... | none | WASH6P | A0A7I2YQU6 | coding | havana_homo_sapiens | protein_coding | Ensembl_canonical,PAR,appris_principal_1,basic... | 2 | canonical,basic,all,all | 156021148..156021160 |
673 rows × 25 columns
df2 = df[(df['#chrom'] == 'chrX') & (df['strand'] == '+') & (df['len'] >= 1000)].reset_index()
df2['coords'] = (df2['thickStart'] + 188).astype(str).str.cat((df2['thickStart']+200).astype(str), sep ="..")
df2.at[0,'coords']
# & df['len'] > 1000
'284375..284387'
c = 0
with open ('negative.txt', 'w') as neg:
с = 0
i = 0
while c < 80:
# for coord in df2['coords']:
coord = df2.at[i,'coords']
i += 1
ext = f"/sequence/region/human/{chrN}:{coord}:{strain}?expand_3prime={expand_3prime};expand_5prime={expand_5prime}"
r = requests.get(server+ext, headers={ "Content-Type" : "text/x-fasta"})
if not r.ok:
r.raise_for_status()
sys.exit()
t = r.text.split()[1]
# print (t)
if 'ATG' in t:
print(r.text.split()[1], file = neg)
c+=1
print ('ATG:', c, end = '...')
print (i, end = '...')
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) /usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3079 try: -> 3080 return self._engine.get_loc(casted_key) 3081 except KeyError as err: pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item() KeyError: 'coords' The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) <ipython-input-338-1f2171834f06> in <module> 5 while c < 80: 6 # for coord in df2['coords']: ----> 7 coord = df2.at[i,'coords'] 8 i += 1 9 ext = f"/sequence/region/human/{chrN}:{coord}:{strain}?expand_3prime={expand_3prime};expand_5prime={expand_5prime}" /usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in __getitem__(self, key) 2154 return self.obj.loc[key] 2155 -> 2156 return super().__getitem__(key) 2157 2158 def __setitem__(self, key, value): /usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in __getitem__(self, key) 2101 2102 key = self._convert_key(key) -> 2103 return self.obj._get_value(*key, takeable=self._takeable) 2104 2105 def __setitem__(self, key, value): /usr/local/lib/python3.7/dist-packages/pandas/core/frame.py in _get_value(self, index, col, takeable) 3127 return series._values[index] 3128 -> 3129 series = self._get_item_cache(col) 3130 engine = self.index._engine 3131 /usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in _get_item_cache(self, item) 3790 # pending resolution of GH#33047 3791 -> 3792 loc = self.columns.get_loc(item) 3793 values = self._mgr.iget(loc) 3794 res = self._box_col_values(values, loc).__finalize__(self) /usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3080 return self._engine.get_loc(casted_key) 3081 except KeyError as err: -> 3082 raise KeyError(key) from err 3083 3084 if tolerance is not None: KeyError: 'coords'
# отобрала 60 из этих
Negative testing PWM
with open ('negative.txt', 'r') as f, open ('res_negative.txt', 'w') as res_t:
c = 0
rtest_ar = []
for line in f:
line = line.strip()
res = 0
c+=1
for pos,ch in enumerate(line):
res += w.at[ch, pos+1]
print (res)
# print (w.at[ch, pos+1], sep = '', end = ' ')
# print ('\n', res)
# print (round(res,2), file = res_t)
rtest_ar.append(round(res,2))
ar_neg = np.asarray(rtest_ar)
ar_neg
# для каждой позиции буквы в строчке - найти ее в столбце индекса таблицы и для ее номера найти число. просуммировать в рез
0.5 -0.19999999999999996 -0.49999999999999994 -0.8999999999999999 -1.0999999999999999 -1.7999999999999998 -0.6999999999999997 -2.0999999999999996 -0.5999999999999996 -2.0999999999999996 -1.4999999999999996 -1.2999999999999996
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) /usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3079 try: -> 3080 return self._engine.get_loc(casted_key) 3081 except KeyError as err: pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item() pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item() KeyError: 13 The above exception was the direct cause of the following exception: KeyError Traceback (most recent call last) <ipython-input-337-100c51d7f38e> in <module> 7 c+=1 8 for pos,ch in enumerate(line): ----> 9 res += w.at[ch, pos+1] 10 print (res) 11 # print (w.at[ch, pos+1], sep = '', end = ' ') /usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in __getitem__(self, key) 2154 return self.obj.loc[key] 2155 -> 2156 return super().__getitem__(key) 2157 2158 def __setitem__(self, key, value): /usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in __getitem__(self, key) 2101 2102 key = self._convert_key(key) -> 2103 return self.obj._get_value(*key, takeable=self._takeable) 2104 2105 def __setitem__(self, key, value): /usr/local/lib/python3.7/dist-packages/pandas/core/frame.py in _get_value(self, index, col, takeable) 3127 return series._values[index] 3128 -> 3129 series = self._get_item_cache(col) 3130 engine = self.index._engine 3131 /usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in _get_item_cache(self, item) 3790 # pending resolution of GH#33047 3791 -> 3792 loc = self.columns.get_loc(item) 3793 values = self._mgr.iget(loc) 3794 res = self._box_col_values(values, loc).__finalize__(self) /usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance) 3080 return self._engine.get_loc(casted_key) 3081 except KeyError as err: -> 3082 raise KeyError(key) from err 3083 3084 if tolerance is not None: KeyError: 13
data_2d=[ar,ar_neg]
plt.boxplot(data_2d)
plt.title("Сигнал в геноме: ATG")
plt.show()
a = '40817810538111888575'
len(a)
20