import pandas as pd
import numpy as np
from itertools import groupby
from collections import defaultdict
import re
!wget "http://biocomputo.ibt.unam.mx/operon_mapper/tmp/list_of_operons_3481957" #Скачиваю список оперонов бактерии Pseudomonas aeruginosa
alph = {"A": "T",
"T": "A",
"G": "C",
"C": "G"}
--2023-04-09 00:18:40-- http://biocomputo.ibt.unam.mx/operon_mapper/tmp/list_of_operons_3481957 Распознаётся biocomputo.ibt.unam.mx (biocomputo.ibt.unam.mx)… 132.248.32.18 Подключение к biocomputo.ibt.unam.mx (biocomputo.ibt.unam.mx)|132.248.32.18|:80... соединение установлено. HTTP-запрос отправлен. Ожидание ответа… 200 OK Длина: 473981 (463K) Сохранение в: «list_of_operons_3481957.2» list_of_operons_348 100%[===================>] 462,87K 168KB/s за 2,8s 2023-04-09 00:18:43 (168 KB/s) - «list_of_operons_3481957.2» сохранён [473981/473981]
with open("./list_of_operons_3481957", "r") as file:
file.readline()
text = list(map(lambda x: x.rstrip().split("\t"), file.readlines()))
Ind_operon = defaultdict(list) #В этом блоке создаётся словарь Ind_operon, в котором ключам (индексы оперонов) сопоставляются первые CDS в составе оперона.
for ind, line in groupby(text, key=lambda x: x[0] != ''):
if ind:
Ind_operon[next(line)[0]]
else:
key = list(Ind_operon.keys())[-1]
Ind_operon[key].append(list(line)[0])
for key, value in Ind_operon.items(): #Техническая операция распаковки values в словаре.
value = value[0]
Ind_operon[key] = value
Table = pd.DataFrame(Ind_operon).T
Table.columns = ['Operon', 'IdGene', 'Type', 'COGgene', 'PosLeft', 'postRight', 'Strand', 'Function']
Table.set_index('Operon', append=True)
Table.index = range(1, len(Table) + 1)
Table_with_F = Table.replace("NA", np.nan).dropna(axis=0, subset=['Function'])
Table_with_F #Tables - удобная таблица, составленная из списка оперонов. При этом отобраны только первые CDS с известной функцией (без NA в колонке Function).
Operon | IdGene | Type | COGgene | PosLeft | postRight | Strand | Function | |
---|---|---|---|---|---|---|---|---|
1 | dnaA | CDS | COG0593 | 483 | 2027 | + | [L] ATPase involved in DNA replication initiation | |
2 | lptA | CDS | COG0204 | 7018 | 7791 | - | [I] 1-acyl-sn-glycerol-3-phosphate acyltransfe... | |
4 | glyS | CDS | COG0751 | 10434 | 12488 | - | [J] Glycyl-tRNA synthetase, beta subunit | |
5 | tag | CDS | COG2818 | 13540 | 14091 | + | [L] 3-methyladenine DNA glycosylase | |
6 | cds-NP_248701.1 | CDS | COG1560 | 14235 | 15122 | + | [M] Lauroyl/myristoyl acyltransferase | |
... | ... | ... | ... | ... | ... | ... | ... | ... |
3002 | cds-NP_254235.1 | CDS | COG2814 | 6241851 | 6243056 | + | [G] Arabinose efflux permease | |
3003 | glmS | CDS | COG0449 | 6243111 | 6244946 | - | [M] Glucosamine 6-phosphate synthetase, contai... | |
3004 | cds-NP_254238.1 | CDS | COG0739 | 6245803 | 6246312 | - | [M] Membrane proteins related to metalloendope... | |
3005 | spoOJ | CDS | COG1475 | 6254972 | 6255844 | - | [K] Predicted transcriptional regulators | |
3007 | cds-NP_254254.1 | CDS | COG0486 | 6260390 | 6261757 | - | [R] Predicted GTPase |
2305 rows × 8 columns
mask = list(map(lambda x: bool(len(re.findall(r"[Tt]ranscription[\w\s]+[Ff]actor|[\s\w]*\[J\][\s\w]*|[\w\s]*[RNADNA][\w\s]polymerase|[\s\w]*\[C\][\s\w]*|[\s\w]*P450[\s\w]*|[\w\s]*ATPase[\w\s]*", x))), Table_with_F.Function))
Table_house = Table_with_F[mask] #Сложный отбор housekeeping генов с помощью языка re. Отбирал: 1) Транскрипционные факторы, 2) CDS с категорией [J] (Translation, ribosomal structure and biogenesis), 3) RNA и DNA полимеразы, 4) P450, 5) ATPазы.
Table_house.index = list(map(int, Table_house.index))
Table_house
Operon | IdGene | Type | COGgene | PosLeft | postRight | Strand | Function | |
---|---|---|---|---|---|---|---|---|
1 | dnaA | CDS | COG0593 | 483 | 2027 | + | [L] ATPase involved in DNA replication initiation | |
4 | glyS | CDS | COG0751 | 10434 | 12488 | - | [J] Glycyl-tRNA synthetase, beta subunit | |
34 | cds-NP_248744.1 | CDS | COG1859 | 69543 | 70091 | - | [J] RNA:NAD 2'-phosphotransferase | |
51 | coxB | CDS | COG1622 | 127378 | 128502 | + | [C] Heme/copper-type cytochrome/quinol oxidase... | |
57 | cds-NP_248809.1 | CDS | COG1301 | 138818 | 140167 | + | [C] Na+/H+-dicarboxylate symporters | |
... | ... | ... | ... | ... | ... | ... | ... | ... |
2954 | cds-NP_254157.1 | CDS | COG1186 | 6158949 | 6159563 | - | [J] Protein chain release factor B | |
2962 | gltP | CDS | COG1301 | 6170176 | 6171510 | - | [C] Na+/H+-dicarboxylate symporters | |
2967 | cc4 | CDS | COG2863 | 6181745 | 6182350 | - | [C] Cytochrome c553 | |
2969 | polA | CDS | COG0749 | 6183784 | 6186525 | - | [L] DNA polymerase I - 3'-5' exonuclease | |
2974 | cds-NP_254190.1 | CDS | COG1135 | 6195309 | 6196316 | + | [P] ABC-type metal ion transport system, ATPase |
257 rows × 8 columns
seed = 234
Train_table = Table_house.sample(60, random_state=seed) #Берём обучающую выборку из 40 оперонов.
Train_table
Operon | IdGene | Type | COGgene | PosLeft | postRight | Strand | Function | |
---|---|---|---|---|---|---|---|---|
2261 | cds-NP_252844.1 | CDS | COG2141 | 4648999 | 4650306 | - | [C] Coenzyme F420-dependent N5,N10-methylene t... | |
638 | napC | CDS | COG3005 | 1272200 | 1272796 | - | [C] Nitrate/TMAO reductases, membrane-bound te... | |
2128 | cds-NP_252608.1 | CDS | COG1875 | 4387336 | 4388727 | - | [T] Predicted ATPase related to phosphate star... | |
2877 | cds-NP_254012.1 | CDS | COG4313 | 5996036 | 5996986 | + | [C] Protein involved in meta-pathway of phenol | |
1460 | cds-NP_251406.1 | CDS | COG1902 | 3070719 | 3071954 | - | [C] NADH:flavin oxidoreductases, Old Yellow En... | |
2586 | selB | CDS | COG3276 | 5392523 | 5394448 | - | [J] Selenocysteine-specific translation elonga... | |
2170 | cds-NP_252669.1 | CDS | COG0621 | 4459879 | 4461219 | + | [J] 2-methylthioadenine synthetase | |
1050 | rimJ | CDS | COG1670 | 2108942 | 2109511 | - | [J] Acetyltransferases, including N-acetylases... | |
2245 | cds-NP_252820.1 | CDS | COG0348 | 4619314 | 4621035 | + | [C] Polyferredoxin | |
1017 | modC | CDS | COG4148 | 2020625 | 2021710 | - | [P] ABC-type molybdate transport system, ATPas... | |
105 | cds-NP_248893.1 | CDS | COG0154 | 230543 | 232000 | - | [J] Asp-tRNAAsn/Glu-tRNAGln amidotransferase A... | |
1970 | cds-NP_252304.1 | CDS | COG1236 | 4048519 | 4049922 | + | [J] Predicted exonuclease of the beta-lactamas... | |
684 | cds-NP_249943.1 | CDS | COG2055 | 1359590 | 1360594 | + | [C] Malate/L-lactate dehydrogenases | |
1277 | fpvI | CDS | COG1595 | 2640392 | 2640871 | - | [K] DNA-directed RNA polymerase specialized si... | |
2716 | cds-NP_253733.1 | CDS | COG0281 | 5683471 | 5684739 | - | [C] Malic enzyme | |
1261 | cds-NP_251042.1 | CDS | COG0584 | 2598700 | 2599827 | + | [C] Glycerophosphoryl diester phosphodiesterase | |
496 | dinB | CDS | COG0389 | 1007950 | 1008999 | + | [L] Nucleotidyltransferase/DNA polymerase invo... | |
1655 | cds-NP_251709.1 | CDS | COG0488 | 3379815 | 3381737 | - | [R] ATPase components of ABC transporters with | |
706 | rnd | CDS | COG0349 | 1405338 | 1406462 | + | [J] Ribonuclease D | |
2028 | cds-NP_252422.1 | CDS | COG1804 | 4183710 | 4184939 | + | [C] Predicted acyl-CoA transferases/carnitine ... | |
2123 | prfC | CDS | COG4108 | 4371969 | 4373552 | + | [J] Peptide chain release factor RF-3 | |
602 | cds-NP_249818.1 | CDS | COG0667 | 1219621 | 1220610 | - | [C] Predicted oxidoreductases (related to aryl... | |
2430 | pilB | CDS | COG2804 | 5069763 | 5071463 | + | [NU] Type II secretory pathway, ATPase PulE/Tfp | |
136 | gabD | CDS | COG1012 | 299522 | 300973 | + | [C] NAD-dependent aldehyde dehydrogenases | |
1632 | rne | CDS | COG1530 | 3332881 | 3336054 | + | [J] Ribonucleases G and E | |
2023 | cds-NP_252412.1 | CDS | COG1902 | 4167711 | 4168817 | - | [C] NADH:flavin oxidoreductases, Old Yellow En... | |
2036 | rplS | CDS | COG0335 | 4195008 | 4195358 | - | [J] Ribosomal protein L19 | |
2588 | fadH2 | CDS | COG1902 | 5403305 | 5405350 | + | [C] NADH:flavin oxidoreductases, Old Yellow En... | |
2755 | cds-NP_253813.1 | CDS | COG0839 | 5775620 | 5776087 | - | [C] NADH:ubiquinone oxidoreductase subunit 6 (... | |
1419 | nuoA | CDS | COG0838 | 2982781 | 2983194 | + | [C] NADH:ubiquinone oxidoreductase subunit 3 (... | |
1208 | cds-NP_250907.1 | CDS | COG1012 | 2437428 | 2439011 | + | [C] NAD-dependent aldehyde dehydrogenases | |
1459 | cds-NP_251404.1 | CDS | COG0243 | 3068040 | 3070349 | + | [C] Anaerobic dehydrogenases, typically seleno... | |
1410 | clpA | CDS | COG0542 | 2962303 | 2964579 | - | [O] ATPases with chaperone activity, ATP-bindi... | |
2335 | cds-NP_253023.1 | CDS | COG1951 | 4861653 | 4863176 | + | [C] Tartrate dehydratase alpha subunit/Fumarat... | |
1609 | morB | CDS | COG1902 | 3287340 | 3288449 | + | [C] NADH:flavin oxidoreductases, Old Yellow En... | |
1611 | cds-NP_251626.1 | CDS | COG3038 | 3291283 | 3291864 | + | [C] Cytochrome B561 | |
2732 | cds-NP_253766.1 | CDS | COG1490 | 5718882 | 5719319 | - | [J] D-Tyr-tRNAtyr deacylase | |
649 | cds-NP_249883.1 | CDS | COG0037 | 1293267 | 1294091 | + | [D] Predicted ATPase of the PP-loop superfamily | |
866 | sucC | CDS | COG0045 | 1730181 | 1731347 | + | [C] Succinyl-CoA synthetase, beta subunit | |
251 | cds-NP_249150.1 | CDS | COG0542 | 518083 | 520635 | + | [O] ATPases with chaperone activity, ATP-bindi... | |
2553 | pnp | CDS | COG1185 | 5323374 | 5325479 | - | [J] Polyribonucleotide nucleotidyltransferase ... | |
1133 | fusA2 | CDS | COG0480 | 2272460 | 2274568 | + | [J] Translation elongation factors (GTPases) | |
1465 | cds-NP_251415.1 | CDS | COG0542 | 3077312 | 3079195 | + | [O] ATPases with chaperone activity, ATP-bindi... | |
2628 | cds-NP_253568.1 | CDS | COG0444 | 5477754 | 5478095 | + | [EP] ABC-type dipeptide/oligopeptide/nickel tr... | |
2008 | lysS | CDS | COG1190 | 4140884 | 4142389 | - | [J] Lysyl-tRNA synthetase (class II) | |
2872 | rpmG | CDS | COG0267 | 5985716 | 5985871 | - | [J] Ribosomal protein L33 | |
2861 | cds-NP_253986.1 | CDS | COG0427 | 5967286 | 5969145 | - | [C] Acetyl-CoA hydrolase | |
2651 | azu | CDS | COG3241 | 5521664 | 5522110 | - | [C] Azurin | |
2954 | cds-NP_254157.1 | CDS | COG1186 | 6158949 | 6159563 | - | [J] Protein chain release factor B | |
1185 | cds-NP_250869.1 | CDS | COG2890 | 2400655 | 2401605 | - | [J] Methylase of polypeptide chain release fac... | |
197 | fdx1 | CDS | COG1146 | 406247 | 406498 | - | [C] Ferredoxin | |
2969 | polA | CDS | COG0749 | 6183784 | 6186525 | - | [L] DNA polymerase I - 3'-5' exonuclease | |
1902 | metG | CDS | COG0143 | 3895324 | 3897357 | + | [J] Methionyl-tRNA synthetase | |
1757 | cds-NP_251922.1 | CDS | COG0847 | 3618993 | 3619619 | - | [L] DNA polymerase III, epsilon subunit and | |
1686 | cds-NP_251760.1 | CDS | COG0714 | 3442823 | 3443803 | + | [R] MoxR-like ATPases | |
2000 | ppc | CDS | COG2352 | 4127756 | 4130392 | - | [C] Phosphoenolpyruvate carboxylase | |
2188 | cds-NP_252711.1 | CDS | COG1012 | 4502271 | 4503791 | + | [C] NAD-dependent aldehyde dehydrogenases | |
1069 | cds-NP_250654.1 | CDS | COG0488 | 2146961 | 2148526 | - | [R] ATPase components of ABC transporters with | |
2251 | tyrS | CDS | COG0162 | 4627947 | 4629185 | - | [J] Tyrosyl-tRNA synthetase | |
2756 | cds-NP_253814.1 | CDS | COG0219 | 5776086 | 5776547 | + | [J] Predicted rRNA methylase (SpoU class) |
with open("./sequencePse.fasta", "r") as file: #Выуживаю последовательность генома бактерии...
file.readline()
sequence = ''.join(map(lambda x: x.rstrip(), file.readlines()))
oper_seq_train = dict() #Словарь с ключами в виде номеров оперов и значениями, представляющими из себя последовательности соответствующего промотора.
for operon, row in Train_table.iterrows():
if row.Strand == "+":
start = int(row.PosLeft) - 150
end = int(row.PosLeft)
seq = sequence[(start-1):(end-1)]
else:
start = int(row.postRight)
end = int(row.postRight) + 150
seq = ''.join(map(lambda x: alph[x], sequence[(start-1):(end-1)][::-1]))
oper_seq_train[operon] = seq
set_all = set(Table.index)
set_train = set(oper_seq_train.keys())
set_test = set_all.difference(set_train) #Все опероны, не попавшие в train, идут в test.
Test_table = Table.loc[sorted(list(map(int, set_test))), :]
Test_table
Operon | IdGene | Type | COGgene | PosLeft | postRight | Strand | Function | |
---|---|---|---|---|---|---|---|---|
1 | dnaA | CDS | COG0593 | 483 | 2027 | + | [L] ATPase involved in DNA replication initiation | |
2 | lptA | CDS | COG0204 | 7018 | 7791 | - | [I] 1-acyl-sn-glycerol-3-phosphate acyltransfe... | |
3 | cds-NP_064727.1 | CDS | ROG0444 | 8671 | 10377 | + | NA | |
4 | glyS | CDS | COG0751 | 10434 | 12488 | - | [J] Glycyl-tRNA synthetase, beta subunit | |
5 | tag | CDS | COG2818 | 13540 | 14091 | + | [L] 3-methyladenine DNA glycosylase | |
... | ... | ... | ... | ... | ... | ... | ... | ... |
3003 | glmS | CDS | COG0449 | 6243111 | 6244946 | - | [M] Glucosamine 6-phosphate synthetase, contai... | |
3004 | cds-NP_254238.1 | CDS | COG0739 | 6245803 | 6246312 | - | [M] Membrane proteins related to metalloendope... | |
3005 | spoOJ | CDS | COG1475 | 6254972 | 6255844 | - | [K] Predicted transcriptional regulators | |
3006 | cds-NP_254253.1 | CDS | NA | 6259671 | 6260054 | - | NA | |
3007 | cds-NP_254254.1 | CDS | COG0486 | 6260390 | 6261757 | - | [R] Predicted GTPase |
2947 rows × 8 columns
oper_seq_test = dict() #Словарь с ключами в виде номеров оперов и значениями, представляющими из себя последовательности соответствующего промотора.
for operon, row in Test_table.iterrows():
if row.Strand == "+":
start = int(row.PosLeft) - 150
end = int(row.PosLeft)
seq = sequence[(start-1):(end-1)]
else:
start = int(row.postRight)
end = int(row.postRight) + 150
seq = ''.join(map(lambda x: alph[x], sequence[(start-1):(end-1)][::-1]))
oper_seq_test[operon] = seq
def aux_function(x): #Вспомогательная функция для проверки пересечения случайно выбранного интервала с одним из промоторов.
if x[2] == "+":
start_pr = int(x[0]) - 150
end_pr = int(x[0])
else:
start_pr = int(x[1])
end_pr = int(x[1]) + 150
return (start < start_pr)*(end < start_pr) or (start > end_pr)*(end > end_pr)
n_neg = 0 #Формирование негативного контроля, состоящего из последовательностей вне промоторов (в том же количестве, что и test).
seq_neg = []
while n_neg != len(oper_seq_test):
start = np.random.randint(1, len(sequence) + 1)
end = start + 150
if np.all(list(map(aux_function, Table.iloc[:, 4:7].values))):
seq = sequence[(start-1):(end-1)]
seq_neg.append(seq)
n_neg += 1
else:
continue
with open("Train.fasta", "w") as file:
for operon, seq in oper_seq_train.items():
print(f">Operon{operon}", file=file)
print(seq, file=file)
with open("Test.fasta", "w") as file:
for operon, seq in oper_seq_test.items():
print(f">Operon{operon}", file=file)
print(seq, file=file)
with open("Negative.fasta", "w") as file:
for ind, seq in enumerate(seq_neg, 1):
print(f">{ind}", file=file)
print(seq, file=file)
fimo0_01 = pd.read_csv('fimo0_01.tsv', sep='\t')
row = fimo0_01.shape[0]
print(f"Количество найденных находок: {row} ")
n_s = len(fimo0_01.sequence_name.dropna().unique())
print(f"Количество последовательностей, в которых нашлись находки (хотя бы одна): {n_s}")
fimo0_01.sequence_name.value_counts() #На Operon №2137 приходится 16 найденных мотивов! Это беда... Понижаем порог.
Количество найденных находок: 1708 Количество последовательностей, в которых нашлись находки (хотя бы одна): 859
Operon2137 16 Operon890 10 Operon99 10 Operon1177 10 Operon104 10 .. Operon603 1 Operon2784 1 Operon1253 1 Operon2696 1 Operon1519 1 Name: sequence_name, Length: 859, dtype: int64
fimo0_001 = pd.read_csv('fimo0_001.tsv', sep='\t')
row = fimo0_001.shape[0]
print(f"Количество найденных находок: {row} ")
n_s = len(fimo0_001.sequence_name.dropna().unique())
print(f"Количество последовательностей, в которых нашлись находки (хотя бы одна): {n_s}")
fimo0_001.sequence_name.value_counts() #Сильно лучше, но я бы попробовал ещё чуть-чуть уменьшить порог.
Количество найденных находок: 180 Количество последовательностей, в которых нашлись находки (хотя бы одна): 148
Operon104 3 Operon1400 3 Operon2293 3 Operon890 3 Operon2318 3 .. Operon912 1 Operon913 1 Operon141 1 Operon2115 1 Operon1521 1 Name: sequence_name, Length: 148, dtype: int64
fimo0_0005 = pd.read_csv('fimo0_0005.tsv', sep='\t')
row = fimo0_0005.shape[0]
print(f"Количество найденных находок: {row} ")
n_s = len(fimo0_0005.sequence_name.dropna().unique())
print(f"Количество последовательностей, в которых нашлись находки (хотя бы одна): {n_s}")
fimo0_0005.sequence_name.value_counts() #Сильно лучше не стало, а количество находок сильно уменьшилось => будем брать порог 0.001
Количество найденных находок: 80 Количество последовательностей, в которых нашлись находки (хотя бы одна): 67
Operon104 3 Operon1608 2 Operon495 2 Operon2293 2 Operon2709 2 .. Operon1712 1 Operon1447 1 Operon1278 1 Operon2283 1 Operon2012 1 Name: sequence_name, Length: 67, dtype: int64
fimo0_0001 = pd.read_csv('fimo0_0001.tsv', sep='\t')
row = fimo0_0001.shape[0]
print(f"Количество найденных находок: {row} ")
n_s = len(fimo0_0001.sequence_name.dropna().unique())
print(f"Количество последовательностей, в которых нашлись находки (хотя бы одна): {n_s}")
fimo0_0001.sequence_name.value_counts() #Отлично, но мало :)
Количество найденных находок: 18 Количество последовательностей, в которых нашлись находки (хотя бы одна): 14
Operon2709 2 Operon683 1 Operon1278 1 Operon1608 1 Operon495 1 Operon2293 1 Operon890 1 Operon1833 1 Operon1552 1 Operon2327 1 Operon1564 1 Operon2662 1 Operon1177 1 Operon104 1 Name: sequence_name, dtype: int64
fimo0_00001 = pd.read_csv('fimo0_00001.tsv', sep='\t')
row = fimo0_00001.shape[0]
print(f"Количество найденных находок: {row} ")
n_s = len(fimo0_00001.sequence_name.dropna().unique())
print(f"Количество последовательностей, в которых нашлись находки (хотя бы одна): {n_s}")
fimo0_00001.sequence_name.value_counts() #Самые лучшие находки... В небольшом количестве.
Количество найденных находок: 7 Количество последовательностей, в которых нашлись находки (хотя бы одна): 4
Operon683 1 Operon1278 1 Operon1608 1 Operon495 1 Name: sequence_name, dtype: int64
fimo0_001_neg = pd.read_csv('fimo0_001_Neg.tsv', sep='\t')
row = fimo0_001_neg.shape[0]
print(f"Количество найденных находок: {row} ")
n_s = len(fimo0_001_neg.sequence_name.dropna().unique())
print(f"Количество последовательностей, в которых нашлись находки (хотя бы одна): {n_s}")
fimo0_001_neg.sequence_name.value_counts() #Получилось 40 мотивов в 33 последовательностях... Сильно меньше, чем в тестовом наборе.
Количество найденных находок: 40 Количество последовательностей, в которых нашлись находки (хотя бы одна): 33
2929.0 2 1276.0 2 754.0 2 1924.0 2 873.0 1 2282.0 1 1679.0 1 1962.0 1 670.0 1 1941.0 1 1432.0 1 2554.0 1 1989.0 1 1829.0 1 255.0 1 2176.0 1 1714.0 1 325.0 1 1183.0 1 1461.0 1 1103.0 1 12.0 1 1900.0 1 649.0 1 1102.0 1 2258.0 1 858.0 1 1433.0 1 2136.0 1 906.0 1 2730.0 1 2568.0 1 890.0 1 Name: sequence_name, dtype: int64