In [32]:
import pandas as pd 
import itertools as it 
import io
import numpy as np
In [165]:
np.random.seed(100)  # no random call is made in this cell; kept so the global RNG state is unchanged

# Parse the operon listing into {operon number: DataFrame of that operon's rows}.
with open('list_of_operons_169730.txt', 'r') as file1:
    file1.readline()  # discard the first line (presumably a column header — TODO confirm)
    lst = file1.readlines()

# Consecutive lines are grouped by whether their first character is a decimal
# digit: digit-led runs carry operon numbers, the other runs are tab-separated
# rows belonging to the operon announced just before them.
grouper = it.groupby(lst, lambda x: x[0] in '0123456789')

dct_oper = dict()
current_id = None  # operon number announced by the most recent digit-led line
for i, j in grouper:
    chunk = list(j)
    if i:
        # Register every number in the run (not only the first) so that an
        # operon without rows is still visible as a None placeholder.
        for header in chunk:
            current_id = int(header.strip())
            dct_oper[current_id] = None
    else:
        if current_id is None:
            # Rows that precede any operon number cannot be attributed to an operon.
            continue
        # Key the table by the number read from the file rather than by a
        # separate running counter, so keys and tables can never drift apart.
        # .iloc[:, 1:] drops the first parsed column (presumably empty because
        # the rows start with a tab — TODO confirm).
        dct_oper[current_id] = pd.read_csv(
            io.StringIO(''.join(chunk)), sep='\t', header=None
        ).iloc[:, 1:]
dct_oper[2]
Out[165]:
1 2 3 4 5 6 7
0 IBLOPNHP_00003 CDS COG2501 3206 3421 + [S] Uncharacterized conserved protein
1 IBLOPNHP_00004 CDS COG1195 3437 4549 + [L] Recombinational DNA repair ATPase (RecF pa...
2 IBLOPNHP_00005 CDS ROG0244 4567 4812 + NaN
3 IBLOPNHP_00006 CDS COG0187 4861 6783 + [L] Type IIA topoisomerase (DNA gyrase/topo II,
4 IBLOPNHP_00007 CDS COG0188 6994 9459 + [L] Type IIA topoisomerase (DNA gyrase/topo II,
In [166]:
np.random.seed(100)
# Training set: start coordinates (column 4 of the first row) of operons whose
# first row is on the '+' strand and whose first-row text in column 7 contains
# at least one of the keywords. Operons are visited in a seeded random order
# and collection stops once 50 coordinates have been gathered.
key_words = ['topoisomerase', 'DNA', 'RNA', 'Ribosomal', 'replication', 'Adenin']
random_ind = list(range(1, len(dct_oper) + 1))
np.random.shuffle(random_ind)

train_set = []
for ind in random_ind:
    table = dct_oper[ind]
    if table.loc[0, 6] != '+':
        # Only the '+' strand is handled; the other strand was never implemented.
        continue
    # Only the first row of the operon is ever examined.
    leading_row = next(table.itertuples())
    description = str(leading_row[7])
    if any(word in description for word in key_words):
        train_set.append(table.loc[0, 4])
    if len(train_set) == 50:
        break

train_set
Out[166]:
[938731,
 1933477,
 406131,
 1322915,
 135364,
 3035730,
 1162267,
 1717933,
 20880,
 1324471,
 121919,
 2225337,
 511157,
 194849,
 583589,
 713664,
 45633,
 2769850,
 1752278,
 1915221,
 737603,
 1461770,
 991414,
 898961,
 571389,
 1613357,
 2635815,
 1521351,
 2284771,
 251427,
 2652993,
 816113,
 872425,
 1863448,
 2549775,
 655223,
 3257693,
 1866389,
 728732,
 290915,
 26814,
 935452,
 1319011,
 410,
 739878,
 1166743,
 965909,
 876426,
 3383565,
 3947161]
In [167]:
# Turn every training coordinate into a [coordinate - 100, coordinate] window.
train_coords = []
for start in train_set:
    train_coords.append([start - 100, start])
train_coords
Out[167]:
[[938631, 938731],
 [1933377, 1933477],
 [406031, 406131],
 [1322815, 1322915],
 [135264, 135364],
 [3035630, 3035730],
 [1162167, 1162267],
 [1717833, 1717933],
 [20780, 20880],
 [1324371, 1324471],
 [121819, 121919],
 [2225237, 2225337],
 [511057, 511157],
 [194749, 194849],
 [583489, 583589],
 [713564, 713664],
 [45533, 45633],
 [2769750, 2769850],
 [1752178, 1752278],
 [1915121, 1915221],
 [737503, 737603],
 [1461670, 1461770],
 [991314, 991414],
 [898861, 898961],
 [571289, 571389],
 [1613257, 1613357],
 [2635715, 2635815],
 [1521251, 1521351],
 [2284671, 2284771],
 [251327, 251427],
 [2652893, 2652993],
 [816013, 816113],
 [872325, 872425],
 [1863348, 1863448],
 [2549675, 2549775],
 [655123, 655223],
 [3257593, 3257693],
 [1866289, 1866389],
 [728632, 728732],
 [290815, 290915],
 [26714, 26814],
 [935352, 935452],
 [1318911, 1319011],
 [310, 410],
 [739778, 739878],
 [1166643, 1166743],
 [965809, 965909],
 [876326, 876426],
 [3383465, 3383565],
 [3947061, 3947161]]
In [168]:
np.random.seed(100)
# Test set: [start - 100, start] windows taken from the first row of '+'-strand
# operons, kept only when neither window endpoint falls inside a training
# window. Operons are visited in a seeded random order; stops at 50 windows.
random_ind = list(range(1, len(dct_oper) + 1))
np.random.shuffle(random_ind)

test_coords = []
for ind in random_ind:
    table = dct_oper[ind]
    if table.loc[0, 6] == '+':
        # Only the '+' strand is handled; only the first row is examined.
        leading_row = next(table.itertuples())
        right = leading_row[4]
        left = right - 100
        left_is_free = all(left < m or left > n for m, n in train_coords)
        if left_is_free and all(right < m or right > n for m, n in train_coords):
            test_coords.append([left, right])

    if len(test_coords) == 50:
        break
test_coords
Out[168]:
[[2043938, 2044038],
 [314783, 314883],
 [3075312, 3075412],
 [3991618, 3991718],
 [4169945, 4170045],
 [2019697, 2019797],
 [3155625, 3155725],
 [2466621, 2466721],
 [203629, 203729],
 [986886, 986986],
 [1751101, 1751201],
 [505052, 505152],
 [2028754, 2028854],
 [1083751, 1083851],
 [1211377, 1211477],
 [1414897, 1414997],
 [456923, 457023],
 [3421672, 3421772],
 [2148576, 2148676],
 [3471166, 3471266],
 [504589, 504689],
 [798369, 798469],
 [1926580, 1926680],
 [976469, 976569],
 [1153689, 1153789],
 [2119107, 2119207],
 [2596435, 2596535],
 [3066351, 3066451],
 [1891808, 1891908],
 [56252, 56352],
 [581603, 581703],
 [2035939, 2036039],
 [1537341, 1537441],
 [2473051, 2473151],
 [494406, 494506],
 [2802258, 2802358],
 [1481447, 1481547],
 [696095, 696195],
 [753717, 753817],
 [3422254, 3422354],
 [1314353, 1314453],
 [4167010, 4167110],
 [2152827, 2152927],
 [3723354, 3723454],
 [3939769, 3939869],
 [3108514, 3108614],
 [792582, 792682],
 [1157137, 1157237],
 [875328, 875428],
 [3707044, 3707144]]
In [169]:
np.random.seed(100)
# Coordinates the negative control must avoid: column 4 of the first row of
# every operon whose first row is on the '+' strand. The '-' strand is not
# checked because sequences are only ever taken from the '+' strand.
anti_neg = [
    table.loc[0, 4]
    for table in dct_oper.values()
    if table.loc[0, 6] == '+'
]

len(anti_neg)
Out[169]:
1094
In [77]:
# Read the genome into one contiguous string, skipping FASTA header lines
# (those starting with '>'). Line endings are removed with strip() rather than
# by cutting the last character, so a file without a final newline keeps its
# last base and '\r' from CRLF files does not leak into the sequence.
with open('bacsu_genome.fasta', 'r') as file1:
    genome = ''.join(line.strip() for line in file1 if not line.startswith('>'))
In [170]:
np.random.seed(100)
anti_neg_coords = [[i-100,i] for i in anti_neg]
neg_contr_coords = []
lst_coords = np.random.randint(1, len(genome)-100, size=1000) #должно хватить
for i in lst_coords:
    neg_contr_coords.append([i-100,i]) if all([ (i-100<m or i-100>n) for m,n in anti_neg_coords])  and all([ (i<m or i>n) for m,n in anti_neg_coords]) else None
    
    if len(neg_contr_coords) == 50:
        break
     

len(neg_contr_coords)
Out[170]:
50
In [171]:
def write_fasta(mode: str, lst_coords: list, sequence=None, line_width: int = 50):
    """Write the sequence windows in ``lst_coords`` to ``{mode}.fa`` in FASTA format.

    Parameters
    ----------
    mode : str
        File name stem; the output path is ``f'{mode}.fa'``.
    lst_coords : list
        Iterable of ``[start, stop]`` pairs used as slice bounds into the sequence.
    sequence : str, optional
        Sequence to slice. Defaults to the module-level ``genome`` string.
    line_width : int, optional
        Maximum number of characters per sequence line (default 50).

    Records are named ``seq1_BACSU``, ``seq2_BACSU``, ... in input order.
    Each window is bounded by ``stop`` and wrapped at ``line_width``, so windows
    shorter than one line no longer spill past ``stop`` or leave an empty line.
    Negative bounds are clamped to 0 instead of being interpreted as offsets
    from the end of the sequence.
    """
    if sequence is None:
        sequence = genome

    pieces = []
    for counter, (i, j) in enumerate(lst_coords, start=1):
        window = sequence[max(i, 0):max(j, 0)]
        pieces.append(f'>seq{counter}_BACSU\n')
        for offset in range(0, len(window), line_width):
            pieces.append(window[offset:offset + line_width] + '\n')

    with open(f'{mode}.fa', 'w') as file2:
        # print() appends one more newline after the last record; kept so that
        # output for 100-character windows stays byte-identical to earlier runs.
        print(''.join(pieces), file=file2)
In [172]:
# Write one FASTA file per window set: train.fa, test.fa and neg_contr.fa.
for stem, windows in (
    ('train', train_coords),
    ('test', test_coords),
    ('neg_contr', neg_contr_coords),
):
    write_fasta(stem, windows)
In [1]:
# Shell command: convert pr7.ipynb (presumably this notebook — confirm) to HTML with nbconvert.
!jupyter nbconvert --to html pr7.ipynb