import pandas as pd
import itertools as it
import io
import numpy as np
np.random.seed(100)

# Read the operon listing; the first line is read and discarded.
with open('list_of_operons_169730.txt', 'r') as file1:
    file1.readline()
    lst = file1.readlines()


def _starts_with_digit(line):
    """Return True when the first character of *line* is one of 0-9."""
    return line[0] in '0123456789'


# Consecutive lines are grouped by whether they begin with a digit.
grouper = it.groupby(lst, _starts_with_digit)
lst_of_oper = []
dct_oper = {}
counter = 0
for is_digit_run, chunk in grouper:
    chunk_lines = list(chunk)
    if is_digit_run:
        # A digit-led run advances the running counter and reserves a None
        # slot under the integer parsed from its first line (minus the
        # trailing character).
        counter += 1
        dct_oper[int(chunk_lines[0][:-1])] = None
    else:
        # Any other run is parsed as a header-less, tab-separated table;
        # its first column is dropped and the result is stored under the
        # current counter value.
        dct_oper[counter] = pd.read_csv(
            io.StringIO(''.join(chunk_lines)), sep='\t', header=None
        ).iloc[:, 1:]
dct_oper[2]
1 | 2 | 3 | 4 | 5 | 6 | 7 | |
---|---|---|---|---|---|---|---|
0 | IBLOPNHP_00003 | CDS | COG2501 | 3206 | 3421 | + | [S] Uncharacterized conserved protein |
1 | IBLOPNHP_00004 | CDS | COG1195 | 3437 | 4549 | + | [L] Recombinational DNA repair ATPase (RecF pa... |
2 | IBLOPNHP_00005 | CDS | ROG0244 | 4567 | 4812 | + | NaN |
3 | IBLOPNHP_00006 | CDS | COG0187 | 4861 | 6783 | + | [L] Type IIA topoisomerase (DNA gyrase/topo II, |
4 | IBLOPNHP_00007 | CDS | COG0188 | 6994 | 9459 | + | [L] Type IIA topoisomerase (DNA gyrase/topo II, |
np.random.seed(100)
# train
# Visit the operon tables in a reproducible shuffled order (keys 1..N).
random_ind = list(range(1, len(dct_oper) + 1))
np.random.shuffle(random_ind)
key_words = ['topoisomerase', 'DNA', 'RNA', 'Ribosomal', 'replication', 'Adenin']
train_set = []
for ind in random_ind:
    table = dct_oper[ind]
    # Only tables whose first row has '+' in column 6 are used
    # (the other strand is not handled).
    if table.loc[0, 6] == '+':
        coord = table.loc[0, 4]
        # Only the first row is examined: its column-7 text must contain
        # at least one of the key words for the coordinate to be kept.
        first_row = next(table.itertuples())
        if any(word in str(first_row[7]) for word in key_words):
            train_set.append(coord)
    # Stop as soon as 50 coordinates have been collected.
    if len(train_set) == 50:
        break
train_set
[938731, 1933477, 406131, 1322915, 135364, 3035730, 1162267, 1717933, 20880, 1324471, 121919, 2225337, 511157, 194849, 583589, 713664, 45633, 2769850, 1752278, 1915221, 737603, 1461770, 991414, 898961, 571389, 1613357, 2635815, 1521351, 2284771, 251427, 2652993, 816113, 872425, 1863448, 2549775, 655223, 3257693, 1866389, 728732, 290915, 26814, 935452, 1319011, 410, 739878, 1166743, 965909, 876426, 3383565, 3947161]
# Turn every selected coordinate into a [coordinate - 100, coordinate] window.
train_coords = [[position - 100, position] for position in train_set]
train_coords
[[938631, 938731], [1933377, 1933477], [406031, 406131], [1322815, 1322915], [135264, 135364], [3035630, 3035730], [1162167, 1162267], [1717833, 1717933], [20780, 20880], [1324371, 1324471], [121819, 121919], [2225237, 2225337], [511057, 511157], [194749, 194849], [583489, 583589], [713564, 713664], [45533, 45633], [2769750, 2769850], [1752178, 1752278], [1915121, 1915221], [737503, 737603], [1461670, 1461770], [991314, 991414], [898861, 898961], [571289, 571389], [1613257, 1613357], [2635715, 2635815], [1521251, 1521351], [2284671, 2284771], [251327, 251427], [2652893, 2652993], [816013, 816113], [872325, 872425], [1863348, 1863448], [2549675, 2549775], [655123, 655223], [3257593, 3257693], [1866289, 1866389], [728632, 728732], [290815, 290915], [26714, 26814], [935352, 935452], [1318911, 1319011], [310, 410], [739778, 739878], [1166643, 1166743], [965809, 965909], [876326, 876426], [3383465, 3383565], [3947061, 3947161]]
np.random.seed(100)
# test
# Same seed and same shuffle as used for the train selection.
random_ind = list(range(1, len(dct_oper) + 1))
np.random.shuffle(random_ind)
test_coords = []
for ind in random_ind:
    table = dct_oper[ind]
    # Only tables whose first row has '+' in column 6 are used
    # (the other strand is not handled).
    if table.loc[0, 6] == '+':
        # Only the first row is examined; column 4 gives the window end.
        first_row = next(table.itertuples())
        start, end = first_row[4] - 100, first_row[4]
        # Keep the window only if neither of its endpoints falls inside
        # any train window.
        if all(
            (start < m or start > n) and (end < m or end > n)
            for m, n in train_coords
        ):
            test_coords.append([start, end])
    # Stop as soon as 50 windows have been collected.
    if len(test_coords) == 50:
        break
test_coords
[[2043938, 2044038], [314783, 314883], [3075312, 3075412], [3991618, 3991718], [4169945, 4170045], [2019697, 2019797], [3155625, 3155725], [2466621, 2466721], [203629, 203729], [986886, 986986], [1751101, 1751201], [505052, 505152], [2028754, 2028854], [1083751, 1083851], [1211377, 1211477], [1414897, 1414997], [456923, 457023], [3421672, 3421772], [2148576, 2148676], [3471166, 3471266], [504589, 504689], [798369, 798469], [1926580, 1926680], [976469, 976569], [1153689, 1153789], [2119107, 2119207], [2596435, 2596535], [3066351, 3066451], [1891808, 1891908], [56252, 56352], [581603, 581703], [2035939, 2036039], [1537341, 1537441], [2473051, 2473151], [494406, 494506], [2802258, 2802358], [1481447, 1481547], [696095, 696195], [753717, 753817], [3422254, 3422354], [1314353, 1314453], [4167010, 4167110], [2152827, 2152927], [3723354, 3723454], [3939769, 3939869], [3108514, 3108614], [792582, 792682], [1157137, 1157237], [875328, 875428], [3707044, 3707144]]
np.random.seed(100)
# neg: the '-' strand need not be checked, because sequences will be taken
# from the '+' strand only.
# Collect the column-4 value of the first row of every table whose first
# row has '+' in column 6, in dictionary order.
anti_neg = [
    table.loc[0, 4]
    for table in dct_oper.values()
    if table.loc[0, 6] == '+'
]
len(anti_neg)
1094
# Load the genome as one continuous string: lines starting with '>' (FASTA
# headers) are skipped and the remaining lines are concatenated.
# Only the line terminator is stripped, so the final base is preserved even
# when the file does not end with a newline (slicing off the last character
# unconditionally would drop it).
with open('bacsu_genome.fasta', 'r') as file1:
    genome = ''.join(
        line.rstrip('\n') for line in file1 if not line.startswith('>')
    )
np.random.seed(100)
# Windows that the negative control must not touch: 100 positions ending at
# every coordinate collected in anti_neg.
anti_neg_coords = [[i - 100, i] for i in anti_neg]
neg_contr_coords = []
# Draw 1000 candidate end positions up front; this should be enough to find
# 50 acceptable windows.
lst_coords = np.random.randint(1, len(genome) - 100, size=1000)
for i in lst_coords:
    start = i - 100
    # Candidates below 100 would give a negative window start, which wraps
    # around when later used as a slice index, so they are skipped. The draws
    # themselves are left unchanged to keep the random stream reproducible.
    if start < 0:
        continue
    # Accept the candidate only if it lies entirely before or entirely after
    # every excluded window (all windows have the same length, so this is
    # the same as requiring both endpoints to be outside each of them).
    if all(i < m or start > n for m, n in anti_neg_coords):
        neg_contr_coords.append([start, i])
    if len(neg_contr_coords) == 50:
        break
len(neg_contr_coords)
50
def write_fasta(mode: str, lst_coords: list, sequence=None):
    """Write the given coordinate windows to ``{mode}.fa`` in FASTA-like form.

    Each window produces a header line ``>seq<N>_BACSU`` (N counts from 1),
    then the first 50 characters of the window on one line and the rest of
    the window, up to its end coordinate, on the next line. The text is
    written with ``print``, so the file ends with one extra empty line.

    Args:
        mode: Output file name without the ``.fa`` extension.
        lst_coords: Iterable of ``(start, end)`` pairs used as slice bounds
            into the sequence.
        sequence: String to slice. When omitted, the module-level ``genome``
            variable is used.

    Raises:
        ValueError: If a window has a negative start; a negative slice index
            would wrap around and silently yield the wrong sequence. The
            check runs before the output file is opened, so no partial file
            is written.
    """
    if sequence is None:
        # Backward-compatible default: slice the globally loaded genome.
        sequence = genome

    records = []
    for number, (start, end) in enumerate(lst_coords, start=1):
        if start < 0:
            raise ValueError(
                f'window {number} has a negative start ({start}); '
                'it would wrap around the sequence'
            )
        records.append(
            f'>seq{number}_BACSU\n'
            f'{sequence[start:start + 50]}\n{sequence[start + 50:end]}\n'
        )

    with open(f'{mode}.fa', 'w') as file2:
        print(''.join(records), file=file2)
# Write one output file per data set: train.fa, test.fa and neg_contr.fa.
for dataset_name, dataset_windows in (
    ('train', train_coords),
    ('test', test_coords),
    ('neg_contr', neg_contr_coords),
):
    write_fasta(dataset_name, dataset_windows)
# IPython shell escape (not plain Python): runs nbconvert to export the
# notebook pr7.ipynb to HTML.
!jupyter nbconvert --to html pr7.ipynb