import pandas as pd
import numpy as np
import requests, sys
from collections import Counter 
server = 'https://rest.ensembl.org'
import matplotlib.pyplot as plt 
from numpy import log2, log


bases = ['A','T','G','C']
# https://www.ncbi.nlm.nih.gov/genome/51
GC = 0.396
fr_gc = GC/2
fr_at=(1-GC)/2


chrN = 'X'
strain = str(1)
expand_3prime = 0
expand_5prime = 0


df = pd.read_csv('human-genes.tsv', '\t')
df2 = df[(df['#chrom'] == 'chrX') & (df['strand'] == '+') & (df['len'] >3000)].iloc[0:100]
# В таблице координаты начала первого кодона ATG указаны как Thickstart если на прямой цепи


positive = df[(df['#chrom'] == 'chrX') & (df['strand'] == '+')].iloc[0:100]
negative_df = df[(df['#chrom'] == 'chrX') & (df['strand'] == '+') & (df['len'] >3000)]


with open ('train.txt', 'w') as train, open ('test.txt', 'w') as test:
    c = 0
    for coord in df2['coords']:
        ext = f"/sequence/region/human/{chrN}:{coord}:{strain}?expand_3prime={expand_3prime};expand_5prime={expand_5prime}"
        r = requests.get(server+ext, headers={ "Content-Type" : "text/x-fasta"})
        if not r.ok:
            r.raise_for_status()
            sys.exit()
        if c< 40:
            c+=1
            print(r.text.split()[1], file = train)
        else:
            print(r.text.split()[1], file = test)


with open ('negative.txt', 'w') as negative:
    for coord in negative_df['coords']:
        ext = f"/sequence/region/human/{chrN}:{coord}:{strain}?expand_3prime={expand_3prime};expand_5prime={expand_5prime}"
        r = requests.get(server+ext, headers={ "Content-Type" : "text/x-fasta"})
        if not r.ok:
            r.raise_for_status()
            sys.exit()
        t = r.text.split()[1]
#         print (t)
        if 'ATG' in t:
            print(r.text.split()[1], file = negative)


# with open ('train.txt', 'r') as f, open ('train.txt', 'w') as out:
#     for line in f:
#         line = line.strip()
#         if line.startswith('>'):
#             pass
#         else:
#             print (line.strip(), file = out)


with open ('train.txt', 'r') as f:
    list_of_lists = []
    for line in f:
        new_list = list(line.strip())
#         new_list = [elem for elem in line.split().s]
        list_of_lists.append(new_list)
MT = np.array(list_of_lists).transpose()

type(list(MT[1]))
N = len(MT[0])


def MT(file):
    with open (file, 'r') as f:
        MT = np.array([list(seq.strip()) for seq in f]).transpose()
    N = len(list(MT[0]))
    return N,MT


def COUNT_BASES(MT):
    count_bases = {}
    for c, lst in enumerate(MT):
        res = {base: (list(lst)).count(base) for base in bases}
    #     list(lst), потому что lst - numpy array
        count_bases[c] = res
    return count_bases


def PWM(count_bases):
    F_table = {}
    e = 0.25
    for i, d2 in count_bases.items():
        res = {}
        for base, count in d2.items():
            if (base == 'G') or (base == 'C'):
                fr = fr_gc
            elif (base == 'A') or (base == 'T'):
                fr = fr_at            
            pwm = round(log(((count+e)/(N+1))/fr),4)
            
            res[base] = pwm
        F_table[i+1] = res   
    return F_table


def ICbj (count_bases,N):
    IC_table = {}
    IC = 0
    for i, d2 in count_bases.items():
        res = {}
        for base, count in d2.items():
            if (base == 'G') or (base == 'C'):
                fr = fr_gc
            elif (base == 'A') or (base == 'T'):
                fr = fr_at            
#                 ic = round((count/N)*log2(((count/N)/fr)),3)
            l = np.where(((count/N)/fr)<=0, 0, log2((count/N)/fr))
            ic = round(((count/N)*l),3)
            res[base] =  ic
            IC += ic
        IC_table[i+1] = res 
        
    return IC_table,round(IC,2)


n, mt = MT('train.txt')
count_bases = COUNT_BASES(mt)
pwm = PWM(count_bases) 
ic_tab, IC = ICbj(count_bases, n)
ic_tab, IC

({1: {'A': -0.096, 'T': -0.119, 'G': 0.232, 'C': 0.084},
  2: {'A': -0.096, 'T': -0.159, 'G': 0.232, 'C': 0.232},
  3: {'A': -0.119, 'T': -0.151, 'G': 0.232, 'C': 0.232},
  4: {'A': -0.096, 'T': -0.119, 'G': 0.003, 'C': 0.346},
  5: {'A': 0.259, 'T': -0.119, 'G': 0.084, 'C': -0.099},
  6: {'A': -0.003, 'T': -0.151, 'G': -0.031, 'C': 0.346},
  7: {'A': -0.151, 'T': -0.159, 'G': 0.232, 'C': 0.468},
  8: {'A': 1.343, 'T': -0.09, 'G': -0.105, 'C': -0.075},
  9: {'A': -0.09, 'T': 1.418, 'G': -0.075, 'C': -0.099},
  10: {'A': -0.09, 'T': -0.13, 'G': 1.966, 'C': -0.075},
  11: {'A': 0.034, 'T': -0.151, 'G': 0.533, 'C': -0.06},
  12: {'A': -0.096, 'T': -0.159, 'G': 0.18, 'C': 0.288},
  13: {'A': -0.068, 'T': -0.096, 'G': 0.13, 'C': 0.084}},
 5.83)


pd.DataFrame.from_dict(pwm)


ICDF = pd.DataFrame.from_dict(ic_tab)


ICDF.to_html()

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>1</th>\n      <th>2</th>\n      <th>3</th>\n      <th>4</th>\n      <th>5</th>\n      <th>6</th>\n      <th>7</th>\n      <th>8</th>\n      <th>9</th>\n      <th>10</th>\n      <th>11</th>\n      <th>12</th>\n      <th>13</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>A</th>\n      <td>-0.096</td>\n      <td>-0.096</td>\n      <td>-0.119</td>\n      <td>-0.096</td>\n      <td>0.259</td>\n      <td>-0.003</td>\n      <td>-0.151</td>\n      <td>1.343</td>\n      <td>-0.090</td>\n      <td>-0.090</td>\n      <td>0.034</td>\n      <td>-0.096</td>\n      <td>-0.068</td>\n    </tr>\n    <tr>\n      <th>T</th>\n      <td>-0.119</td>\n      <td>-0.159</td>\n      <td>-0.151</td>\n      <td>-0.119</td>\n      <td>-0.119</td>\n      <td>-0.151</td>\n      <td>-0.159</td>\n      <td>-0.090</td>\n      <td>1.418</td>\n      <td>-0.130</td>\n      <td>-0.151</td>\n      <td>-0.159</td>\n      <td>-0.096</td>\n    </tr>\n    <tr>\n      <th>G</th>\n      <td>0.232</td>\n      <td>0.232</td>\n      <td>0.232</td>\n      <td>0.003</td>\n      <td>0.084</td>\n      <td>-0.031</td>\n      <td>0.232</td>\n      <td>-0.105</td>\n      <td>-0.075</td>\n      <td>1.966</td>\n      <td>0.533</td>\n      <td>0.180</td>\n      <td>0.130</td>\n    </tr>\n    <tr>\n      <th>C</th>\n      <td>0.084</td>\n      <td>0.232</td>\n      <td>0.232</td>\n      <td>0.346</td>\n      <td>-0.099</td>\n      <td>0.346</td>\n      <td>0.468</td>\n      <td>-0.075</td>\n      <td>-0.099</td>\n      <td>-0.075</td>\n      <td>-0.060</td>\n      <td>0.288</td>\n      <td>0.084</td>\n    </tr>\n  </tbody>\n</table>'


d = pd.DataFrame.from_dict(icbj)
d


Чем больше IC(j), тем больше частоты букв в
колонке отличаются от ожидаемых, тем больше
информации в колонке


with open ('test.txt', 'r') as f, open ('res_test.txt', 'w') as res_t:
    c = 0
    rtest_ar = []
    for line in f:
        line = line.strip()
        res = 0
        c+=1
        for pos,ch in enumerate(line):
            res += w.at[ch, pos+1]
#             print (w.at[ch, pos+1], sep = '', end = ' ')
#         print ('\n', res)
        print (round(res,2), file = res_t)  
        rtest_ar.append(round(res,2))
ar = np.asarray(rtest_ar)
ar
# для каждой позиции буквы в строчке - найти ее в столбце индекса таблицы и для ее номера найти число. просуммировать в рез

array([-7.4,  2.4,  4.8,  4.1,  5.4,  3.9,  4.1,  2.7,  3.1,  3.7,  2.7,
        1.5,  5.2,  5.7, -4.2,  2.5,  4.9,  7.5,  6.7,  4.6,  4.3,  4.5,
        3.7,  5.1,  6.4,  6.4,  6.2,  4.9,  3.1,  3.8,  3.5,  4.5,  5.3,
        2.7,  5. ,  3.4,  5.2,  0.3, -0.8,  5.5,  6.5,  5.4, -1.8,  2.7,
        0.8,  2.9,  4.7,  3.2, -0.7,  5.8,  4.8, -1.6,  3.4,  5.2,  3.6,
       -2.1,  7.8, -4.7,  4.5,  2.9])


plt.boxplot(ar)

{'whiskers': [<matplotlib.lines.Line2D at 0x7f9a24f8e7b8>,
  <matplotlib.lines.Line2D at 0x7f9a24f8e160>],
 'caps': [<matplotlib.lines.Line2D at 0x7f9a24f8c978>,
  <matplotlib.lines.Line2D at 0x7f9a24f8c4a8>],
 'boxes': [<matplotlib.lines.Line2D at 0x7f9a24f8e860>],
 'medians': [<matplotlib.lines.Line2D at 0x7f9a24f8bf98>],
 'fliers': [<matplotlib.lines.Line2D at 0x7f9a24f8bcc0>],
 'means': []}


df = pd.read_csv('human-genes.tsv', '\t')
df2 = df[(df['#chrom'] == 'chrX') & (df['strand'] == '+')].iloc[0:150]
df2 = df2[df2.len > 1000]
# В таблице координаты начала первого кодона ATG указаны как Thickstart если на прямой цепи

df2.shape

(144, 24)


# df3 = df2.drop([df2.len < 1000].index)
# # df = df.drop([df2.score < 50].index)


df2['coords'] = (df2['thickStart'] + 188).astype(str).str.cat((df2['thickStart']+200).astype(str), sep ="..")

/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

42727          284375..284387
42731          631085..631097
42733        1282891..1282903
42734        1341953..1341965
42739        1593650..1593662
                 ...         
44236    155774059..155774071
44237    155889654..155889666
44238    155997931..155997943
44239    155997947..155997959
44240    156021148..156021160
Name: coords, Length: 673, dtype: object

df2


сhrN = 'X'
strand = str(1)
strand_df = '+'
expand_3prime = 0
expand_5prime = 0


with open ('negative.txt', 'w') as neg:
    c = 0
    for coord in df2['coords']:
        ext = f"/sequence/region/human/{chrN}:{coord}:{strain}?expand_3prime={expand_3prime};expand_5prime={expand_5prime}"
        r = requests.get(server+ext, headers={ "Content-Type" : "text/x-fasta"})
        if not r.ok:
            r.raise_for_status()
            sys.exit()
        t = r.text.split()[1]
#         print (t)
        if 'ATG' in t:
            print(r.text.split()[1], file = neg)

df2


df2 = df[(df['#chrom'] == 'chrX') & (df['strand'] == '+') & (df['len'] >= 1000)].reset_index()
df2['coords'] = (df2['thickStart'] + 188).astype(str).str.cat((df2['thickStart']+200).astype(str), sep ="..")
df2.at[0,'coords']
#  & df['len'] > 1000

'284375..284387'


c = 0
with open ('negative.txt', 'w') as neg:
    с = 0
    i = 0
    while c < 80:
#         for coord in df2['coords']:
        coord = df2.at[i,'coords']
        i += 1
        ext = f"/sequence/region/human/{chrN}:{coord}:{strain}?expand_3prime={expand_3prime};expand_5prime={expand_5prime}"
        r = requests.get(server+ext, headers={ "Content-Type" : "text/x-fasta"})
        if not r.ok:
            r.raise_for_status()
            sys.exit()
        t = r.text.split()[1]
#         print (t)
        if 'ATG' in t:
            print(r.text.split()[1], file = neg)
            c+=1
            print ('ATG:', c, end = '...')
        print (i, end = '...')

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3079             try:
-> 3080                 return self._engine.get_loc(casted_key)
   3081             except KeyError as err:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'coords'

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
<ipython-input-338-1f2171834f06> in <module>
      5     while c < 80:
      6 #         for coord in df2['coords']:
----> 7         coord = df2.at[i,'coords']
      8         i += 1
      9         ext = f"/sequence/region/human/{chrN}:{coord}:{strain}?expand_3prime={expand_3prime};expand_5prime={expand_5prime}"

/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in __getitem__(self, key)
   2154             return self.obj.loc[key]
   2155 
-> 2156         return super().__getitem__(key)
   2157 
   2158     def __setitem__(self, key, value):

/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in __getitem__(self, key)
   2101 
   2102         key = self._convert_key(key)
-> 2103         return self.obj._get_value(*key, takeable=self._takeable)
   2104 
   2105     def __setitem__(self, key, value):

/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py in _get_value(self, index, col, takeable)
   3127             return series._values[index]
   3128 
-> 3129         series = self._get_item_cache(col)
   3130         engine = self.index._engine
   3131 

/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in _get_item_cache(self, item)
   3790             #  pending resolution of GH#33047
   3791 
-> 3792             loc = self.columns.get_loc(item)
   3793             values = self._mgr.iget(loc)
   3794             res = self._box_col_values(values, loc).__finalize__(self)

/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3080                 return self._engine.get_loc(casted_key)
   3081             except KeyError as err:
-> 3082                 raise KeyError(key) from err
   3083 
   3084         if tolerance is not None:

KeyError: 'coords'


# отобрала 60 из этих


with open ('negative.txt', 'r') as f, open ('res_negative.txt', 'w') as res_t:
    c = 0
    rtest_ar = []
    for line in f:
        line = line.strip()
        res = 0
        c+=1
        for pos,ch in enumerate(line):
            res += w.at[ch, pos+1]
            print (res)
#             print (w.at[ch, pos+1], sep = '', end = ' ')
#         print ('\n', res)
#         print (round(res,2), file = res_t)  
        rtest_ar.append(round(res,2))
ar_neg = np.asarray(rtest_ar)
ar_neg
# для каждой позиции буквы в строчке - найти ее в столбце индекса таблицы и для ее номера найти число. просуммировать в рез

0.5
-0.19999999999999996
-0.49999999999999994
-0.8999999999999999
-1.0999999999999999
-1.7999999999999998
-0.6999999999999997
-2.0999999999999996
-0.5999999999999996
-2.0999999999999996
-1.4999999999999996
-1.2999999999999996

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3079             try:
-> 3080                 return self._engine.get_loc(casted_key)
   3081             except KeyError as err:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()

KeyError: 13

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
<ipython-input-337-100c51d7f38e> in <module>
      7         c+=1
      8         for pos,ch in enumerate(line):
----> 9             res += w.at[ch, pos+1]
     10             print (res)
     11 #             print (w.at[ch, pos+1], sep = '', end = ' ')

/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in __getitem__(self, key)
   2154             return self.obj.loc[key]
   2155 
-> 2156         return super().__getitem__(key)
   2157 
   2158     def __setitem__(self, key, value):

/usr/local/lib/python3.7/dist-packages/pandas/core/indexing.py in __getitem__(self, key)
   2101 
   2102         key = self._convert_key(key)
-> 2103         return self.obj._get_value(*key, takeable=self._takeable)
   2104 
   2105     def __setitem__(self, key, value):

/usr/local/lib/python3.7/dist-packages/pandas/core/frame.py in _get_value(self, index, col, takeable)
   3127             return series._values[index]
   3128 
-> 3129         series = self._get_item_cache(col)
   3130         engine = self.index._engine
   3131 

/usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in _get_item_cache(self, item)
   3790             #  pending resolution of GH#33047
   3791 
-> 3792             loc = self.columns.get_loc(item)
   3793             values = self._mgr.iget(loc)
   3794             res = self._box_col_values(values, loc).__finalize__(self)

/usr/local/lib/python3.7/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   3080                 return self._engine.get_loc(casted_key)
   3081             except KeyError as err:
-> 3082                 raise KeyError(key) from err
   3083 
   3084         if tolerance is not None:

KeyError: 13


data_2d=[ar,ar_neg]

plt.boxplot(data_2d)
plt.title("Сигнал в геноме: ATG")
plt.show()


a = '40817810538111888575'
len(a)

20

	#chrom	chromStart	chromEnd	name	strand	len	thickStart	thickEnd	reserved	len-thick	...	exonFrames	type	geneName	geneName2	transcriptClass	source	transcriptType	tag	level	tier
42727	chrX	276321	303353	ENST00000399012.6	+	27033	284187	299335	789624	15149	...	-1,-1,0,1,0,0,0,1,	none	PLCXD1	Q9NUJ7	coding	havana_homo_sapiens	protein_coding	CCDS,PAR,appris_principal_1,basic	2	canonical,basic,all,all
42731	chrX	624343	646823	ENST00000381578.6	+	22481	630897	644636	789624	13740	...	-1,0,1,0,1,0,	none	SHOX	O15266	coding	havana_homo_sapiens	protein_coding	CCDS,PAR,appris_principal_1,basic	2	canonical,basic,all,all
42733	chrX	1268799	1309935	ENST00000417535.7	+	41137	1282703	1309479	789624	26777	...	-1,-1,0,1,0,1,2,1,0,0,1,1,2,0,	none	CSF2RA	P15509	coding	ensembl_homo_sapiens	protein_coding	CCDS,PAR,appris_alternative_2,basic,ncRNA_host	3	basic,all,all
42734	chrX	1336615	1382689	ENST00000381469.7	+	46075	1341765	1382465	789624	40701	...	-1,0,1,2,1,0,0,1,2,0,	none	IL3RA	P26951	coding	ensembl_homo_sapiens	protein_coding	CCDS,PAR,basic	3	basic,all,all
42739	chrX	1591603	1602520	ENST00000313871.9	+	10918	1593462	1601594	789624	8133	...	-1,0,0,2,0,	none	AKAP17A	Q02040	coding	ensembl_havana_transcript_homo_sapiens	protein_coding	CCDS,Ensembl_canonical,MANE_Select,PAR,appris_...	2	canonical,basic,all,all
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
44236	chrX	155612587	155776795	ENST00000675360.1	+	164209	155773871	155774738	789624	868	...	-1,-1,-1,0,	none	SPRY3	O43610	coding	havana_homo_sapiens	protein_coding	CCDS,Ensembl_canonical,RNA_Seq_supported_only,...	2	canonical,basic,all
44237	chrX	155881344	155943769	ENST00000286448.12	+	62426	155889466	155941951	789624	52486	...	-1,0,2,0,0,1,0,0,	none	VAMP7	P51809	coding	ensembl_havana_transcript_homo_sapiens	protein_coding	CCDS,Ensembl_canonical,MANE_Select,PAR,appris_...	2	canonical,basic,all,all
44238	chrX	155997705	156010817	ENST00000369423.7	+	13113	155997743	156009852	789624	12110	...	0,2,1,1,2,0,0,1,2,	none	IL9R	Q01113	coding	ensembl_havana_transcript_homo_sapiens	protein_coding	CCDS,PAR,basic,overlapping_locus	2	basic,all,all
44239	chrX	155997695	156010817	ENST00000244174.11	+	13123	155997759	156010409	789624	12651	...	0,1,1,2,1,0,1,2,0,	none	IL9R	Q01113	coding	ensembl_havana_transcript_homo_sapiens	protein_coding	CCDS,Ensembl_canonical,MANE_Select,PAR,appris_...	2	canonical,basic,all,all
44240	chrX	156020960	156025374	ENST00000359512.8	+	4415	156020960	156025374	789624	4415	...	0,2,0,0,2,0,1,1,1,1,	none	WASH6P	A0A7I2YQU6	coding	havana_homo_sapiens	protein_coding	Ensembl_canonical,PAR,appris_principal_1,basic...	2	canonical,basic,all,all

	1	2	3	4	5	6	7	8	9	10	11	12	13
A	-0.2916	-0.2916	-0.4060	-0.2916	0.3879	-0.0107	-0.6837	1.0462	-2.2931	-2.2931	0.0678	-0.2916	-0.1890
T	-0.4060	-0.8580	-0.6837	-0.4060	-0.4060	-0.6837	-1.0693	-2.2931	1.0742	-1.7053	-1.3376	-0.8580	-0.2916
G	0.4899	0.4899	0.4899	0.0161	0.2332	-0.1131	0.4899	-0.9154	-1.8709	1.4964	0.8101	0.4114	0.3263
C	0.2332	0.4899	0.4899	0.6305	-0.6472	0.6305	0.7537	-1.8709	-1.2832	-1.8709	-0.2615	0.5627	0.2332

	1	2	3	4	5	6	7	8	9	10	11	12	13
A	-0.065	-0.157	-0.065	-0.092	0.123	-0.065	-0.135	1.742	0.000	0.000	0.001	-0.065	-0.157
T	-0.065	-0.092	-0.149	-0.150	-0.158	-0.116	0.000	0.000	1.742	0.000	-0.116	-0.157	0.039
G	0.037	0.280	0.124	0.037	0.397	-0.063	0.280	0.000	0.000	2.315	0.337	0.124	0.280
C	0.124	0.173	0.225	0.589	-0.086	0.397	0.589	0.000	0.000	0.000	-0.086	0.280	-0.001

	#chrom	chromStart	chromEnd	name	strand	len	thickStart	thickEnd	reserved	len-thick	...	type	geneName	geneName2	transcriptClass	source	transcriptType	tag	level	tier	coords
42727	chrX	276321	303353	ENST00000399012.6	+	27033	284187	299335	789624	15149	...	none	PLCXD1	Q9NUJ7	coding	havana_homo_sapiens	protein_coding	CCDS,PAR,appris_principal_1,basic	2	canonical,basic,all,all	284375..284387
42731	chrX	624343	646823	ENST00000381578.6	+	22481	630897	644636	789624	13740	...	none	SHOX	O15266	coding	havana_homo_sapiens	protein_coding	CCDS,PAR,appris_principal_1,basic	2	canonical,basic,all,all	631085..631097
42733	chrX	1268799	1309935	ENST00000417535.7	+	41137	1282703	1309479	789624	26777	...	none	CSF2RA	P15509	coding	ensembl_homo_sapiens	protein_coding	CCDS,PAR,appris_alternative_2,basic,ncRNA_host	3	basic,all,all	1282891..1282903
42734	chrX	1336615	1382689	ENST00000381469.7	+	46075	1341765	1382465	789624	40701	...	none	IL3RA	P26951	coding	ensembl_homo_sapiens	protein_coding	CCDS,PAR,basic	3	basic,all,all	1341953..1341965
42739	chrX	1591603	1602520	ENST00000313871.9	+	10918	1593462	1601594	789624	8133	...	none	AKAP17A	Q02040	coding	ensembl_havana_transcript_homo_sapiens	protein_coding	CCDS,Ensembl_canonical,MANE_Select,PAR,appris_...	2	canonical,basic,all,all	1593650..1593662
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
44236	chrX	155612587	155776795	ENST00000675360.1	+	164209	155773871	155774738	789624	868	...	none	SPRY3	O43610	coding	havana_homo_sapiens	protein_coding	CCDS,Ensembl_canonical,RNA_Seq_supported_only,...	2	canonical,basic,all	155774059..155774071
44237	chrX	155881344	155943769	ENST00000286448.12	+	62426	155889466	155941951	789624	52486	...	none	VAMP7	P51809	coding	ensembl_havana_transcript_homo_sapiens	protein_coding	CCDS,Ensembl_canonical,MANE_Select,PAR,appris_...	2	canonical,basic,all,all	155889654..155889666
44238	chrX	155997705	156010817	ENST00000369423.7	+	13113	155997743	156009852	789624	12110	...	none	IL9R	Q01113	coding	ensembl_havana_transcript_homo_sapiens	protein_coding	CCDS,PAR,basic,overlapping_locus	2	basic,all,all	155997931..155997943
44239	chrX	155997695	156010817	ENST00000244174.11	+	13123	155997759	156010409	789624	12651	...	none	IL9R	Q01113	coding	ensembl_havana_transcript_homo_sapiens	protein_coding	CCDS,Ensembl_canonical,MANE_Select,PAR,appris_...	2	canonical,basic,all,all	155997947..155997959
44240	chrX	156020960	156025374	ENST00000359512.8	+	4415	156020960	156025374	789624	4415	...	none	WASH6P	A0A7I2YQU6	coding	havana_homo_sapiens	protein_coding	Ensembl_canonical,PAR,appris_principal_1,basic...	2	canonical,basic,all,all	156021148..156021160

	#chrom	chromStart	chromEnd	name	strand	len	thickStart	thickEnd	reserved	len-thick	...	type	geneName	geneName2	transcriptClass	source	transcriptType	tag	level	tier	coords
42727	chrX	276321	303353	ENST00000399012.6	+	27033	284187	299335	789624	15149	...	none	PLCXD1	Q9NUJ7	coding	havana_homo_sapiens	protein_coding	CCDS,PAR,appris_principal_1,basic	2	canonical,basic,all,all	284375..284387
42731	chrX	624343	646823	ENST00000381578.6	+	22481	630897	644636	789624	13740	...	none	SHOX	O15266	coding	havana_homo_sapiens	protein_coding	CCDS,PAR,appris_principal_1,basic	2	canonical,basic,all,all	631085..631097
42733	chrX	1268799	1309935	ENST00000417535.7	+	41137	1282703	1309479	789624	26777	...	none	CSF2RA	P15509	coding	ensembl_homo_sapiens	protein_coding	CCDS,PAR,appris_alternative_2,basic,ncRNA_host	3	basic,all,all	1282891..1282903
42734	chrX	1336615	1382689	ENST00000381469.7	+	46075	1341765	1382465	789624	40701	...	none	IL3RA	P26951	coding	ensembl_homo_sapiens	protein_coding	CCDS,PAR,basic	3	basic,all,all	1341953..1341965
42739	chrX	1591603	1602520	ENST00000313871.9	+	10918	1593462	1601594	789624	8133	...	none	AKAP17A	Q02040	coding	ensembl_havana_transcript_homo_sapiens	protein_coding	CCDS,Ensembl_canonical,MANE_Select,PAR,appris_...	2	canonical,basic,all,all	1593650..1593662
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
44236	chrX	155612587	155776795	ENST00000675360.1	+	164209	155773871	155774738	789624	868	...	none	SPRY3	O43610	coding	havana_homo_sapiens	protein_coding	CCDS,Ensembl_canonical,RNA_Seq_supported_only,...	2	canonical,basic,all	155774059..155774071
44237	chrX	155881344	155943769	ENST00000286448.12	+	62426	155889466	155941951	789624	52486	...	none	VAMP7	P51809	coding	ensembl_havana_transcript_homo_sapiens	protein_coding	CCDS,Ensembl_canonical,MANE_Select,PAR,appris_...	2	canonical,basic,all,all	155889654..155889666
44238	chrX	155997705	156010817	ENST00000369423.7	+	13113	155997743	156009852	789624	12110	...	none	IL9R	Q01113	coding	ensembl_havana_transcript_homo_sapiens	protein_coding	CCDS,PAR,basic,overlapping_locus	2	basic,all,all	155997931..155997943
44239	chrX	155997695	156010817	ENST00000244174.11	+	13123	155997759	156010409	789624	12651	...	none	IL9R	Q01113	coding	ensembl_havana_transcript_homo_sapiens	protein_coding	CCDS,Ensembl_canonical,MANE_Select,PAR,appris_...	2	canonical,basic,all,all	155997947..155997959
44240	chrX	156020960	156025374	ENST00000359512.8	+	4415	156020960	156025374	789624	4415	...	none	WASH6P	A0A7I2YQU6	coding	havana_homo_sapiens	protein_coding	Ensembl_canonical,PAR,appris_principal_1,basic...	2	canonical,basic,all,all	156021148..156021160

Выбор хромосомы и прочтение таблицы с генами¶

Создание файлов с последовательностями для train, test, negative¶

Создание PWM для train¶

TEST </H3>