import numpy as np
import matplotlib.pyplot as plt
#the Viterbi algorithm
#I know, that the states "B" and "E" occur only in the beginning and the end of the sequence, so there is no need to check their probabilities every letter.
#To decrease the number of calculations, I excluded them from the list of states ( HMM_pattern.self.states = ["Bg", "S1", "S2", "S3", "S4"] )
#Thus, I have to deal with the first ("B") and the last ("E") states separatly from others
def viterbi(HMM_pattern, seq):
probabilities = []
ancestors = [["anc" for _ in range(len(seq)-2)] for _ in range(len(HMM_pattern.states))]
probs = [[0 for _ in range(len(seq)-2)] for _ in range(len(HMM_pattern.states))]
#prev_state = "B"
#cur_state = seq[0]
#best_prob = 0
#deal with the first state
for state_ind, state in enumerate(HMM_pattern.states):
prob = HMM_pattern.transition_probs['B'][state]*HMM_pattern.outcomes_probs[state][seq[1]]
ancestors[state_ind][0] = "B"
probs[state_ind][0] = prob
#deal with all the letters in the middle
for letter in range(2, len(seq)-1):
for state_ind, state in enumerate(HMM_pattern.states):
best_prob = 0
#prob_letter_given_state = HMM_pattern.outcomes_probs[state][seq[letter]]
for prev_ind, prev in enumerate(HMM_pattern.states):
#prob_come_to_state = HMM_pattern.transition_probs[prev][state] #вероятность прийти из В в состояние ст
prob = HMM_pattern.transition_probs[prev][state]*HMM_pattern.outcomes_probs[state][seq[letter]]*probs[prev_ind][letter-2]
if prob>best_prob:
best_prob = prob
best_anc = prev
ancestors[state_ind][letter-1] = best_anc
probs[state_ind][letter-1] = best_prob
#deal with the last transition
best_preend_prob = 0
for state_ind, state in enumerate(HMM_pattern.states):
if probs[state_ind][-1] * HMM_pattern.transition_probs[state]["E"] > best_preend_prob:
best_preend_state = state
best_preend_prob = probs[state_ind][-1] * HMM_pattern.transition_probs[state]["E"]
#going backward to find the best route
site_indices = []
path_reversed = [best_preend_state]
for ind in range (len(seq)-3, 0, -1):
optimal_state = ancestors[HMM_pattern.states.index(path_reversed[-1])][ind]
path_reversed.append(optimal_state)
if optimal_state == "S1":
site_indices.append(ind-1)
site_indices.reverse()
#the algorithm returns supposed indices of the restriction site
return(site_indices)
def forward_backward(HMM_pattern, seq):
#probabilities = []
backward = [[0 for _ in range(len(seq)-2)] for _ in range(len(HMM_pattern.states))]
forward = [[0 for _ in range(len(seq)-2)] for _ in range(len(HMM_pattern.states))]
final = [[0 for _ in range(len(seq)-2)] for _ in range(len(HMM_pattern.states))]
# Forward pass
for state_idx, state in enumerate(HMM_pattern.states):
prob_state = HMM_pattern.transition_probs['B'][state]
prob_letter_given_state = HMM_pattern.outcomes_probs[state][seq[1]]
forward[state_idx][0] = prob_state * prob_letter_given_state
for letter in range(2, len(seq)-1):
for state_idx, state in enumerate(HMM_pattern.states):
fk = 0
prob_letter_given_state = HMM_pattern.outcomes_probs[state][seq[letter]]
for prev_idx, prev in enumerate(HMM_pattern.states):
prob_come_to_state = HMM_pattern.transition_probs[prev][state]
fk += prob_come_to_state * prob_letter_given_state * forward[prev_idx][letter-2]
forward[state_idx][letter-1] = fk
#fE
fE = sum(forward[state_idx][-1] for state_idx in range(len(HMM_pattern.states)))
# Backward pass
for state_idx, state in enumerate(HMM_pattern.states):
prob_state = HMM_pattern.transition_probs[state]["E"]
prob_letter_given_state = HMM_pattern.outcomes_probs[state][seq[-2]]
backward[state_idx][-1] = prob_state * prob_letter_given_state
for letter in range(len(seq)-3, 0, -1):
for state_idx, state in enumerate(HMM_pattern.states):
bk = 0
prob_letter_given_state = HMM_pattern.outcomes_probs[state][seq[letter]]
for next_idx, next_state in enumerate(HMM_pattern.states):
prob_come_to_state = HMM_pattern.transition_probs[state][next_state]
bk += prob_come_to_state * prob_letter_given_state * backward[next_idx][letter]
backward[state_idx][letter-1] = bk
#implementing the formula fk(i)*bk(i)/fE(L), multiplying by the coefficient so that the probabilities add up to 1
for letter in range(len(seq)-2):
sum_position = 0
for state in range(len(HMM_pattern.states)):
total = forward[state][letter] * backward[state][letter] / fE
final[state][letter] = total
sum_position+=total
coeff = 1/sum_position
for state in range(len(HMM_pattern.states)):
final[state][letter] = final[state][letter]*coeff
site_indices = []
for position in range(len(seq)-2):
if final[1][position] > 0.5:
site_indices.append(position)
#the algorithm returns the supposed indices of the restriction site and the matrix of probabilities of each state at each position - in the next section I will visualize them
return site_indices, final
#class of the HMM, containing all the needed probabilities (a(Bg → site)=0.01)
class HMM_01:
def __init__(self):
self.states = ["Bg", "S1", "S2", "S3", "S4"]
#probabilities of transitions from one state to another
self.transition_probs = {
"B": {"B": 0, "Bg" : 0.99, "S1" : 0.01, "S2": 0, "S3": 0, "S4": 0, "E" : 0},
"Bg": {"B" : 0, "Bg" : 0.98, "S1" : 0.01, "S2": 0, "S3": 0, "S4": 0, "E" : 0.01},
"S1" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 1, "S3" : 0, "S4" : 0, "E" : 0},
"S2" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 1, "S4" : 0, "E" : 0},
"S3" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 1, "E" : 0},
"S4" : {"B" : 0, "Bg" : 0.99, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 0, "E" : 0.01},
"E" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 0, "E" : 1}
}
#probabilities of letters given each state
self.outcomes_probs = {
"B" : {"a" : 0, "c": 0, "g": 0, "t": 0, "B": 1, "E": 0},
"Bg": {"a" : 0.25, "c": 0.25, "g": 0.25, "t": 0.25, "B": 0, "E": 0},
"S1": {"a" : 1, "c": 0, "g": 0, "t": 0, "B": 0, "E": 0},
"S2": {"a" : 0, "c": 1, "g": 0, "t": 0, "B": 0, "E": 0},
"S3": {"a" : 0, "c": 0, "g": 1, "t": 0, "B": 0, "E": 0},
"S4": {"a" : 0, "c": 0, "g": 0, "t": 1, "B": 0, "E": 0},
"E" : {"a" : 0, "c": 0, "g": 0, "t": 0, "B": 0, "E": 1}
}
test_seq = "gataggattatcattcataagtttcagagcaatgtccttattctggaacttggatttatggctcttttggtttaatttcgcctgattcttgatctcctttagcttctcgacgtgggcctttttcttgccatatggatccgctgcacggtcctgttccctagcatgtacgtgagcgtatttccttttaaaccacgacgctttgtcttcattcaacgtttcccattgtttttttctactattgctttgctgtgggaaaaacttatcgaaagatgacgactttttcttaattctcgttttaagagcttggtgagcgctaggagtcactgccag"
seq = "B" + test_seq + "E"
HMM_pattern = HMM_01()
Viterbi algorithm - it finds three sites on the position 109, 166 and 212, as it should
viterbi_1 = viterbi(HMM_pattern, seq)
for site in viterbi_1:
print("Restriction site:", site, site+3)
Restriction site: 109 112 Restriction site: 166 169 Restriction site: 212 215
Forward-backward algorithm - same result
fb_1_ind, fb_1 = forward_backward(HMM_pattern, seq)
labels = ["background", "S1", "S2", "S3", "S4"]
plt.figure(figsize=(15,6))
for i in range(len(fb_1)):
plt.plot(range(1,len(fb_1[i])+1), fb_1[i], label=labels[i])
plt.xlabel('Position')
plt.ylabel('Probability')
plt.title('a(Bg → site)=0.01')
plt.legend()
plt.grid(True)
plt.show()
for site in fb_1_ind:
print("Restriction site:", site, site+3)
Restriction site: 109 112 Restriction site: 166 169 Restriction site: 212 215
#change the probabilities (a(Bg → site)=0.001)
class HMM_001:
def __init__(self):
self.states = ["Bg", "S1", "S2", "S3", "S4"]
#probabilities of transitions from one state to another
self.transition_probs = {
"B": {"B": 0, "Bg" : 0.99, "S1" : 0.01, "S2": 0, "S3": 0, "S4": 0, "E" : 0},
"Bg": {"B" : 0, "Bg" : 0.989, "S1" : 0.001, "S2": 0, "S3": 0, "S4": 0, "E" : 0.01},
"S1" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 1, "S3" : 0, "S4" : 0, "E" : 0},
"S2" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 1, "S4" : 0, "E" : 0},
"S3" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 1, "E" : 0},
"S4" : {"B" : 0, "Bg" : 0.99, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 0, "E" : 0.01},
"E" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 0, "E" : 1}
}
#probabilities of letters given each state
self.outcomes_probs = {
"B" : {"a" : 0, "c": 0, "g": 0, "t": 0, "B": 1, "E": 0},
"Bg": {"a" : 0.25, "c": 0.25, "g": 0.25, "t": 0.25, "B": 0, "E": 0},
"S1": {"a" : 1, "c": 0, "g": 0, "t": 0, "B": 0, "E": 0},
"S2": {"a" : 0, "c": 1, "g": 0, "t": 0, "B": 0, "E": 0},
"S3": {"a" : 0, "c": 0, "g": 1, "t": 0, "B": 0, "E": 0},
"S4": {"a" : 0, "c": 0, "g": 0, "t": 1, "B": 0, "E": 0},
"E" : {"a" : 0, "c": 0, "g": 0, "t": 0, "B": 0, "E": 1}
}
HMM_pattern = HMM_001()
Viterbi algorithm - this time it doesn't find the site - it is more probable to see "acgt" as a part of the background than to enter "site state"
(a(Bg → site)=0.001)
viterbi_2 = viterbi(HMM_pattern, seq)
for site in viterbi_2:
print("Restriction site:", site, site+3)
print(len(viterbi_2))
0
Forward-backward algorithm, on the other hand, finds the sites, but this time it's much less "sure" about the result - the probabilities of the site are lower.
fb_2_ind, fb_2 = forward_backward(HMM_pattern, seq)
labels = ["background", "S1", "S2", "S3", "S4"]
plt.figure(figsize=(15,6))
for i in range(len(fb_2)):
plt.plot(range(1,len(fb_2[i])+1), fb_2[i], label=labels[i])
plt.xlabel('Position')
plt.ylabel('Probability')
plt.title('a(Bg → site)=0.001')
plt.legend()
plt.grid(True)
plt.show()
for site in fb_2_ind:
print("Restriction site:", site, site+3)
Restriction site: 109 112 Restriction site: 166 169 Restriction site: 212 215
But if we continue to lower the value a(Bg → site), FB-algorith will also "lose" the sites. However, they won't be lost completely unlike the case of Viterbi algorithm, because we still see non-zero probabilities of the sites
#change the probabilities (a(Bg → site)=0.001)
class HMM_0001:
def __init__(self):
self.states = ["Bg", "S1", "S2", "S3", "S4"]
#probabilities of transitions from one state to another
self.transition_probs = {
"B": {"B": 0, "Bg" : 0.99, "S1" : 0.01, "S2": 0, "S3": 0, "S4": 0, "E" : 0},
"Bg": {"B" : 0, "Bg" : 0.9899, "S1" : 0.0001, "S2": 0, "S3": 0, "S4": 0, "E" : 0.01},
"S1" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 1, "S3" : 0, "S4" : 0, "E" : 0},
"S2" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 1, "S4" : 0, "E" : 0},
"S3" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 1, "E" : 0},
"S4" : {"B" : 0, "Bg" : 0.99, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 0, "E" : 0.01},
"E" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 0, "E" : 1}
}
#probabilities of letters given each state
self.outcomes_probs = {
"B" : {"a" : 0, "c": 0, "g": 0, "t": 0, "B": 1, "E": 0},
"Bg": {"a" : 0.25, "c": 0.25, "g": 0.25, "t": 0.25, "B": 0, "E": 0},
"S1": {"a" : 1, "c": 0, "g": 0, "t": 0, "B": 0, "E": 0},
"S2": {"a" : 0, "c": 1, "g": 0, "t": 0, "B": 0, "E": 0},
"S3": {"a" : 0, "c": 0, "g": 1, "t": 0, "B": 0, "E": 0},
"S4": {"a" : 0, "c": 0, "g": 0, "t": 1, "B": 0, "E": 0},
"E" : {"a" : 0, "c": 0, "g": 0, "t": 0, "B": 0, "E": 1}
}
HMM_pattern = HMM_0001()
fb_3_ind, fb_3 = forward_backward(HMM_pattern, seq)
labels = ["background", "S1", "S2", "S3", "S4"]
plt.figure(figsize=(15,6))
for i in range(len(fb_3)):
plt.plot(range(1,len(fb_3[i])+1), fb_3[i], label=labels[i])
plt.xlabel('Position')
plt.ylabel('Probability')
plt.title('a(Bg → site)=0.0001')
plt.legend()
plt.grid(True)
plt.show()
print("sites found:", len(fb_3_ind))
sites found: 0
HMM_pattern = HMM_01()
#function that generates sequence according the HMM
#note, that the length is undetermined - the sequence end when reaches "E" state
def generate_HMM(HMM_pattern):
seq_states = [ "B" ]
seq = [ "B" ]
#next state and letter are chosen based on their probabilities
while seq[-1] != 'E':
next_state = np.random.choice(list(HMM_pattern.transition_probs[seq_states[-1]].keys()), p=list(HMM_pattern.transition_probs[seq_states[-1]].values()))
next_letter = np.random.choice(list(HMM_pattern.outcomes_probs[next_state].keys()), p=list(HMM_pattern.outcomes_probs[next_state].values()))
seq_states.append(next_state)
seq.append(next_letter)
return(''.join(seq))
np.random.seed(123) #set seed for reproducibility
generated_seq = generate_HMM(HMM_pattern)
generated_seq
'BcgcacgtgagtgcagccctgacctgtgcggggaagtggaggctacttgccgaggcctggtagtgcatacgtgtattcatctattgataacattggggggaatggagtccattagaatatgcacgtccttcaE'
viterbi_myseq = viterbi(HMM_pattern, generated_seq)
for site in viterbi_myseq:
print("Restriction site:", site, site+3)
Restriction site: 3 6 Restriction site: 67 70 Restriction site: 121 124
fb_myseq_ind, fb_myseq = forward_backward(HMM_pattern, generated_seq)
labels = ["background", "S1", "S2", "S3", "S4"]
plt.figure(figsize=(10,6))
for i in range(len(fb_myseq)):
plt.plot(range(1,len(fb_myseq[i])+1), fb_myseq[i], label=labels[i])
plt.xlabel('Position')
plt.ylabel('Value')
plt.legend()
plt.grid(True)
plt.show()
for site in fb_myseq_ind:
print("Restriction site:", site, site+3)
Restriction site: 3 6 Restriction site: 67 70 Restriction site: 121 124
The results of the algorithms match, the both find 3 restriction site