Задание 1. Схема автомата для модели¶
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
test_seq = '''gataggattatcattcataagtttcagagcaatgtccttattctggaacttggatttatggctctt
ttggtttaatttcgcctgattcttgatctcctttagcttctcgacgtgggcctttttcttgccata
tggatccgctgcacggtcctgttccctagcatgtacgtgagcgtatttccttttaaaccacgacgc
tttgtcttcattcaacgtttcccattgtttttttctactattgctttgctgtgggaaaaacttatc
gaaagatgacgactttttcttaattctcgttttaagagcttggtgagcgctaggagtcactgccag'''
test_seq = ''.join(test_seq.split('\n'))
states = ['bg', 'S1', 'S2', 'S3', 'S4']
transition_matrix_test = np.array(
[[0.99, 0.01, 0, 0, 0],
[0, 0, 1, 0, 0],
[0, 0, 0, 1, 0],
[0, 0, 0, 0, 1],
[0.99, 0, 0, 0, 0]])
emission_matrices_test = np.array([[0.25, 0.25, 0.25, 0.25],
[1, 0, 0, 0],
[0, 1, 0, 0],
[0, 0, 1, 0],
[0, 0, 0, 1]])
initial_transitions_test = [0.99, 0.01, 0, 0, 0]
final_transitions_test = [0.01, 0, 0, 0, 0.01]
Задание 2. Алгоритм Витерби¶
# Viterbi algorithm
def Vitebri(seq, states, transition_matrix, emission_matrices, initial=None, final=None):
"""
Parameters
----------
seq: str
A protein or nucleotide sequence (recommended)
states: 1d array
A list of states, including the initial and the final states. The initial and
the final states should not be included.
transition_matrix: 2d array
A probability matrix for transitions to all the hidden states. The probabilities
should correspond to the states listed in "states" variable
emission_matrices: 2d array
A matrix for probabilities of observations given a hidden state (excluding
the initial and the final states). The probabilities should correspond to the
sequence alphabet in alphabetical order
initial: 1d array, optional
A list of probabilities for transitioning from the initial state. Default
probabilities are equal for every hidden state
final: 1d array, optional
A list of probabilities for transitioning to the final state. Default
probabilities are equal for every hidden state
"""
# if no optional parameters were given
if not initial:
initial = [1/len(states) for i in states]
if not final:
final = [1/len(states) for i in states]
# creating a n_states * seq_length matrix for interim likelihoods
alphabet = sorted(list(set(seq)))
dyn_matrix = np.array([[0.0 for i in range(len(seq))] for j in range(len(states))])
letter = alphabet.index(seq[0])
# creating a n_states * se_length matrix for an optimal path
path = [['' for i in seq] for j in states]
# calculating probabilities for the first letter
for state in range(len(states)):
prob = initial[state] * emission_matrices_test[state][letter]
dyn_matrix[state][0] = prob
prev_letter = letter
# Calculating dyn_matrix[state][i] for every state; choose maximum value out
# of all previous scores multiplied by the probability of transitioning from
# the previous to current state and a corresponding probability from the
# emission matrix
for i in range(1, len(seq)):
letter = alphabet.index(seq[i])
for state in range(len(states)):
for prev_state in range(len(states)):
lh = dyn_matrix[prev_state][i - 1] * transition_matrix[prev_state][state] * emission_matrices[state][prev_letter]
if lh > dyn_matrix[state][i]:
dyn_matrix[state][i] = lh
path[state][i] = prev_state
prev_letter = letter
# calculating final likelihoods
for state in range(len(states)):
dyn_matrix[state][-1] *= final[state]
prev_letter = letter
# Reverse traversal to find a path with the maximum likelihood
prev_state = np.argmax([x[-1] for x in dyn_matrix])
print(f'Likelihood of an optimal path is {np.max([x[-1] for x in dyn_matrix])}')
reversed_path = []
reversed_path.append(prev_state)
for i in range(len(seq) - 1, 0, -1):
prev_state = path[prev_state][i]
reversed_path.append(prev_state)
return reversed_path[::-1]
def find_sites(opt_path, site_states, site=''):
'''
Parameters:
opt_path: 1d array
Optimal path found by Vitebri algorithm
states: 1d array
An array of consecutive hidden states corresponding to the restriction site
site: str, optional
A sequence of your site
-----------
'''
cnt = 0
for i, st in enumerate(opt_path):
if st == site_states[cnt] - 1:
cnt += 1
if cnt == len(site_states):
print(f'Found restriction site {site} in position {i - cnt + 1} - {i}')
cnt = 0
res1 = Vitebri(test_seq, states, transition_matrix_test,
emission_matrices_test, initial_transitions_test,
final_transitions_test)
Likelihood of an optimal path is 1.4352073166535307e-201
find_sites(res1, [1, 2, 3, 4], 'ACGT')
Found restriction site ACGT in position 109 - 112 Found restriction site ACGT in position 166 - 169 Found restriction site ACGT in position 212 - 215
Проверим, совпадают ли результаты с обычным поиском по строке (а они должны).
import re
[x.span() for x in re.finditer('acgt', test_seq)]
[(109, 113), (166, 170), (212, 216)]
Теперь изменим вероятность перехода из фона в сайт с 0.01 на 0.001.
transition_matrix_test[0][1] = 0.001
res1_1 = Vitebri(test_seq, states, transition_matrix_test,
emission_matrices_test, initial_transitions_test,
final_transitions_test)
transition_matrix_test[0][1] = 0.01
find_sites(res1_1, [1, 2, 3, 4], 'ACGT')
Likelihood of an optimal path is 7.582581360685768e-203
Как видно, ничего не нашлось. Наиболее оптимальный путь в рамках данной модели не содержал в себе переходы в сайты рестрикции: они были приняты за фоновую модель.
Задание 3. Алгоритм Forward-backward¶
# Forward-backward algorigthm
def forward_backward(seq, states, transition_matrix, emission_matrices, initial=None, final=None):
'''
Refer to Vitebri for parameter description
'''
# if no optional parameters were given
if not initial:
initial = [1/len(states) for i in states]
if not final:
final = [1/len(states) for i in states]
# Forward step
forward_probabilities = np.array([[0.0 for i in range(len(seq))] for j in range(len(states))])
alphabet = sorted(list(set(seq)))
letter = alphabet.index(seq[0])
# Calculating first probabilities
for state in range(len(states)):
forward_probabilities[state][0] = initial[state]
# Calculating forward probability by summing previous forward probabilities
# multiplied by transition probability (prev_state -> cur_state) and multiplying
# it by a corresponding emission probability given a current state
for i in range(1, len(seq)):
letter = alphabet.index(seq[i])
for state in range(len(states)):
for prev_state in range(len(states)):
forward_probabilities[state][i] += forward_probabilities[prev_state][i - 1] * transition_matrix[prev_state][state]
forward_probabilities[state][i] *= emission_matrices[state][letter]
# Calculating final state probabilities
for state in range(len(states)):
forward_probabilities[state][-1] *= final[state]
# Backward step
backward_probabilities = np.array([[0.0 for i in range(len(seq))] for j in range(len(states))])
# Calculating final state probabilities
for state in range(len(states)):
backward_probabilities[state][-1] = final[state]
# Calculating these probabilities is basically the same as the forward part
for i in range(len(seq) - 2, -1, -1):
letter = alphabet.index(seq[i + 1])
for prev_state in range(len(states)):
for state in range(len(states)):
backward_probabilities[state][i] += backward_probabilities[prev_state][i + 1] * transition_matrix[state][prev_state] * emission_matrices[prev_state][letter]
# Calculating initial state probabilities
for state in range(len(states)):
backward_probabilities[state][0] *= initial[state]
full_seq_prob = sum((x[-1] for x in forward_probabilities))
# Calculating aposterior probabilities
apost = forward_probabilities * backward_probabilities / full_seq_prob
return apost
res2 = forward_backward(test_seq, states, transition_matrix_test,
emission_matrices_test, initial_transitions_test,
final_transitions_test)
При построении графика можем заменить резкие падения в вероятностях для фонового состояния в местах, где находятся сайты рестрикции (их обнаружено 3, как и в предыдущем случае).
def draw_task2(res):
fig, ax = plt.subplots()
for state in range(len(res)):
plt.plot(res[state], label=states[state])
ax.set_title('Restriction sites found by forward-backward algorithm')
plt.legend()
plt.show()
draw_task2(res2)
Попробуем поменять вероятности перехода из фона в сайт.Сначала увеличим ее до 0.1 (изначальное значение 0.01), чтобы увидеть заметные изменения. И действительно, вероятность для состояний, соответствующих сайту рестрикции, теперь выше.
transition_matrix_test[0][1] = 0.1
res2_1 = forward_backward(test_seq, states, transition_matrix_test,
emission_matrices_test, initial_transitions_test,
final_transitions_test)
transition_matrix_test[0][1] = 0.01
draw_task2(res2_1)
В противоположной ситуации (при замене указанного значения с 0.01 на 0.001) видно, что вероятности, которые указывают на нахождения рестриктных сайтов, становятся меньше. Если уменьшать значение вероятности перехода из фона в сайт еще дальше, рестриктные сайты будут приняты за фон (например, для значения 0.00001)
transition_matrix_test[0][1] = 0.001
res2_1 = forward_backward(test_seq, states, transition_matrix_test,
emission_matrices_test, initial_transitions_test,
final_transitions_test)
transition_matrix_test[0][1] = 0.01
draw_task2(res2_1)
transition_matrix_test[0][1] = 0.00001
res2_1 = forward_backward(test_seq, states, transition_matrix_test,
emission_matrices_test, initial_transitions_test,
final_transitions_test)
transition_matrix_test[0][1] = 0.01
draw_task2(res2_1)
Задание 4. Генерация последовательности с сайтами рестрикции¶
# Sequence generation
def generate_hmm_seq(alphabet, states, transition_matrix, emission_matrices, initial=None, final=None):
'''
Parameters:
-----------
length: int, optional
length of a generated sequence
For the rest refer to Vitebri for parameter description
'''
# if no optional parameters were given
if not initial:
initial = [1/len(states) for i in states]
if not final:
final = [1/len(states) for i in states]
sequence = ''
all_states = list(np.append(np.insert(states, 0, 'Beg'), 'End'))
state = states.index(np.random.choice(states, p=initial))
prev_state = state
while all_states[state] != 'End':
state = states.index(np.random.choice(states, p=transition_matrix[prev_state]))
letter = np.random.choice(alphabet, p=emission_matrices[state])
all_probs = np.append(np.insert(transition_matrix[state], 0, 0), final[state])
all_probs[1] -= final[state]
end = np.random.choice(all_states, p=all_probs)
if end == 'End':
state = all_states.index(end)
sequence += letter
return sequence
res_3 = generate_hmm_seq(['A', 'C', 'G', 'T'], states, transition_matrix_test,
emission_matrices_test, initial_transitions_test,
final_transitions_test)
res_3
'ATGTTAGTCGACTAGAGACTGCTACATAGGACTACAGTTCGACATAAAATCGTTCGGCTTTCACTAGTCCCCGCACCAGCCTAAGGGTTACGTCTATAAGGCGTAGTACCCCTCTCAGACGTGAAGGAAGGGTGATCGCTTTGTAATAGCCAGACCCGCGCGGGTAAATAGCGCGTGTTAGGATCGCAGAGCTTCG'
В сгенерированной случайной последовательности обнаружены 2 сайта рестрикции при помощи алгоритма Витебри и алгоритма "Вперед-назад".
res3_1 = forward_backward(res_3, states, transition_matrix_test,
emission_matrices_test, initial_transitions_test,
final_transitions_test)
draw_task2(res3_1)
res3_2 = Vitebri(res_3, states, transition_matrix_test,
emission_matrices_test, initial_transitions_test,
final_transitions_test)
find_sites(res3_2, [1, 2, 3, 4], 'ACGT')
Likelihood of an optimal path is 9.820604350736465e-121 Found restriction site ACGT in position 89 - 92 Found restriction site ACGT in position 118 - 121