In [143]:
import numpy as np
import matplotlib.pyplot as plt

HMM model for ACGT site¶

Screenshot 2024-04-05 at 18.15.30.png

Algorithms implementations¶

Viterbi algorithm¶

In [144]:
#the Viterbi algorithm

#I know, that the states "B" and "E" occur only in the beginning and the end of the sequence, so there is no need to check their probabilities every letter.
#To decrease the number of calculations, I excluded them from the list of states ( HMM_pattern.self.states = ["Bg", "S1", "S2", "S3", "S4"] )
#Thus, I have to deal with the first ("B") and the last ("E") states separatly from others

def viterbi(HMM_pattern, seq):
  probabilities = []
  ancestors = [["anc" for _ in range(len(seq)-2)] for _ in range(len(HMM_pattern.states))]
  probs = [[0 for _ in range(len(seq)-2)] for _ in range(len(HMM_pattern.states))]

  #prev_state = "B"
  #cur_state = seq[0]
  #best_prob = 0

  #deal with the first state
  for state_ind, state in enumerate(HMM_pattern.states):
    prob = HMM_pattern.transition_probs['B'][state]*HMM_pattern.outcomes_probs[state][seq[1]]
    ancestors[state_ind][0] = "B"
    probs[state_ind][0] = prob

  #deal with all the letters in the middle
  for letter in range(2, len(seq)-1):
    for state_ind, state in enumerate(HMM_pattern.states):
      best_prob = 0
      #prob_letter_given_state = HMM_pattern.outcomes_probs[state][seq[letter]]
      for prev_ind, prev in enumerate(HMM_pattern.states):
        #prob_come_to_state = HMM_pattern.transition_probs[prev][state] #вероятность прийти из В в состояние ст
        prob = HMM_pattern.transition_probs[prev][state]*HMM_pattern.outcomes_probs[state][seq[letter]]*probs[prev_ind][letter-2]

        if prob>best_prob:
          best_prob = prob
          best_anc = prev

      ancestors[state_ind][letter-1] = best_anc
      probs[state_ind][letter-1] = best_prob

  #deal with the last transition
  best_preend_prob = 0
  for state_ind, state in enumerate(HMM_pattern.states):
    if probs[state_ind][-1] * HMM_pattern.transition_probs[state]["E"] > best_preend_prob:
      best_preend_state = state
      best_preend_prob = probs[state_ind][-1] * HMM_pattern.transition_probs[state]["E"]

  #going backward to find the best route

  site_indices = []
  path_reversed = [best_preend_state]
  for ind in range (len(seq)-3, 0, -1):
    optimal_state = ancestors[HMM_pattern.states.index(path_reversed[-1])][ind]
    path_reversed.append(optimal_state)
    if optimal_state == "S1":
      site_indices.append(ind-1)

  site_indices.reverse()

  #the algorithm returns supposed indices of the restriction site
  return(site_indices)

Forward-backward algorithm¶

In [145]:
def forward_backward(HMM_pattern, seq):
    #probabilities = []
    backward = [[0 for _ in range(len(seq)-2)] for _ in range(len(HMM_pattern.states))]
    forward = [[0 for _ in range(len(seq)-2)] for _ in range(len(HMM_pattern.states))]
    final = [[0 for _ in range(len(seq)-2)] for _ in range(len(HMM_pattern.states))]

    # Forward pass
    for state_idx, state in enumerate(HMM_pattern.states):
        prob_state = HMM_pattern.transition_probs['B'][state]
        prob_letter_given_state = HMM_pattern.outcomes_probs[state][seq[1]]
        forward[state_idx][0] = prob_state * prob_letter_given_state

    for letter in range(2, len(seq)-1):
        for state_idx, state in enumerate(HMM_pattern.states):
            fk = 0
            prob_letter_given_state = HMM_pattern.outcomes_probs[state][seq[letter]]
            for prev_idx, prev in enumerate(HMM_pattern.states):
                prob_come_to_state = HMM_pattern.transition_probs[prev][state]
                fk += prob_come_to_state * prob_letter_given_state * forward[prev_idx][letter-2]
            forward[state_idx][letter-1] = fk

    #fE
    fE = sum(forward[state_idx][-1] for state_idx in range(len(HMM_pattern.states)))

    # Backward pass
    for state_idx, state in enumerate(HMM_pattern.states):
        prob_state = HMM_pattern.transition_probs[state]["E"]
        prob_letter_given_state = HMM_pattern.outcomes_probs[state][seq[-2]]
        backward[state_idx][-1] = prob_state * prob_letter_given_state

    for letter in range(len(seq)-3, 0, -1):
        for state_idx, state in enumerate(HMM_pattern.states):
            bk = 0
            prob_letter_given_state = HMM_pattern.outcomes_probs[state][seq[letter]]
            for next_idx, next_state in enumerate(HMM_pattern.states):
                prob_come_to_state = HMM_pattern.transition_probs[state][next_state]
                bk += prob_come_to_state * prob_letter_given_state * backward[next_idx][letter]
            backward[state_idx][letter-1] = bk

    #implementing the formula fk(i)*bk(i)/fE(L), multiplying by the coefficient so that the probabilities add up to 1
    for letter in range(len(seq)-2):
      sum_position = 0
      for state in range(len(HMM_pattern.states)):
        total = forward[state][letter] * backward[state][letter] / fE
        final[state][letter] = total
        sum_position+=total

      coeff = 1/sum_position

      for state in range(len(HMM_pattern.states)):
        final[state][letter] = final[state][letter]*coeff

    site_indices = []
    for position in range(len(seq)-2):
      if final[1][position] > 0.5:
        site_indices.append(position)

    #the algorithm returns the supposed indices of the restriction site and the matrix of probabilities of each state at each position - in the next section I will visualize them
    return site_indices, final

TESTS¶

a(Bg → site)=0.01¶

In [146]:
#class of the HMM, containing all the needed probabilities (a(Bg → site)=0.01)
class HMM_01:
  def __init__(self):
    self.states = ["Bg", "S1", "S2", "S3", "S4"]
    #probabilities of transitions from one state to another
    self.transition_probs = {
"B": {"B": 0, "Bg" : 0.99, "S1" : 0.01, "S2": 0, "S3": 0, "S4": 0, "E" : 0},
"Bg": {"B" : 0, "Bg" : 0.98, "S1" : 0.01, "S2": 0, "S3": 0, "S4": 0, "E" : 0.01},
"S1" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 1, "S3" : 0, "S4" : 0, "E" : 0},
"S2" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 1, "S4" : 0, "E" : 0},
"S3" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 1, "E" : 0},
"S4" : {"B" : 0, "Bg" : 0.99, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 0, "E" : 0.01},
"E" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 0, "E" : 1}
}
    #probabilities of letters given each state
    self.outcomes_probs = {
"B" : {"a" : 0, "c": 0, "g": 0, "t": 0, "B": 1, "E": 0},
"Bg": {"a" : 0.25, "c": 0.25, "g": 0.25, "t": 0.25, "B": 0, "E": 0},
"S1": {"a" : 1, "c": 0, "g": 0, "t": 0, "B": 0, "E": 0},
"S2": {"a" : 0, "c": 1, "g": 0, "t": 0, "B": 0, "E": 0},
"S3": {"a" : 0, "c": 0, "g": 1, "t": 0, "B": 0, "E": 0},
"S4": {"a" : 0, "c": 0, "g": 0, "t": 1, "B": 0, "E": 0},
"E" : {"a" : 0, "c": 0, "g": 0, "t": 0, "B": 0, "E": 1}
}
In [147]:
test_seq = "gataggattatcattcataagtttcagagcaatgtccttattctggaacttggatttatggctcttttggtttaatttcgcctgattcttgatctcctttagcttctcgacgtgggcctttttcttgccatatggatccgctgcacggtcctgttccctagcatgtacgtgagcgtatttccttttaaaccacgacgctttgtcttcattcaacgtttcccattgtttttttctactattgctttgctgtgggaaaaacttatcgaaagatgacgactttttcttaattctcgttttaagagcttggtgagcgctaggagtcactgccag"

seq = "B" + test_seq + "E"

HMM_pattern = HMM_01()

Viterbi algorithm - it finds three sites on the position 109, 166 and 212, as it should

In [148]:
viterbi_1 = viterbi(HMM_pattern, seq)

for site in viterbi_1:
    print("Restriction site:", site, site+3)
Restriction site: 109 112
Restriction site: 166 169
Restriction site: 212 215

Forward-backward algorithm - same result

In [149]:
fb_1_ind, fb_1 = forward_backward(HMM_pattern, seq)

labels = ["background", "S1", "S2", "S3", "S4"]

plt.figure(figsize=(15,6))
for i in range(len(fb_1)):
    plt.plot(range(1,len(fb_1[i])+1), fb_1[i], label=labels[i])

plt.xlabel('Position')
plt.ylabel('Probability')
plt.title('a(Bg → site)=0.01')
plt.legend()
plt.grid(True)
plt.show()

for site in fb_1_ind:
    print("Restriction site:", site, site+3)
Restriction site: 109 112
Restriction site: 166 169
Restriction site: 212 215

a(Bg → site)=0.001¶

In [150]:
#change the probabilities (a(Bg → site)=0.001)
class HMM_001:
  def __init__(self):
    self.states = ["Bg", "S1", "S2", "S3", "S4"]
    #probabilities of transitions from one state to another
    self.transition_probs = {
"B": {"B": 0, "Bg" : 0.99, "S1" : 0.01, "S2": 0, "S3": 0, "S4": 0, "E" : 0},
"Bg": {"B" : 0, "Bg" : 0.989, "S1" : 0.001, "S2": 0, "S3": 0, "S4": 0, "E" : 0.01},
"S1" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 1, "S3" : 0, "S4" : 0, "E" : 0},
"S2" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 1, "S4" : 0, "E" : 0},
"S3" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 1, "E" : 0},
"S4" : {"B" : 0, "Bg" : 0.99, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 0, "E" : 0.01},
"E" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 0, "E" : 1}
}
    #probabilities of letters given each state
    self.outcomes_probs = {
"B" : {"a" : 0, "c": 0, "g": 0, "t": 0, "B": 1, "E": 0},
"Bg": {"a" : 0.25, "c": 0.25, "g": 0.25, "t": 0.25, "B": 0, "E": 0},
"S1": {"a" : 1, "c": 0, "g": 0, "t": 0, "B": 0, "E": 0},
"S2": {"a" : 0, "c": 1, "g": 0, "t": 0, "B": 0, "E": 0},
"S3": {"a" : 0, "c": 0, "g": 1, "t": 0, "B": 0, "E": 0},
"S4": {"a" : 0, "c": 0, "g": 0, "t": 1, "B": 0, "E": 0},
"E" : {"a" : 0, "c": 0, "g": 0, "t": 0, "B": 0, "E": 1}
}

HMM_pattern = HMM_001()

Viterbi algorithm - this time it doesn't find the site - it is more probable to see "acgt" as a part of the background than to enter "site state"

(a(Bg → site)=0.001)

In [151]:
viterbi_2 = viterbi(HMM_pattern, seq)

for site in viterbi_2:
    print("Restriction site:", site, site+3)

print(len(viterbi_2))
0

Forward-backward algorithm, on the other hand, finds the sites, but this time it's much less "sure" about the result - the probabilities of the site are lower.

In [152]:
fb_2_ind, fb_2 = forward_backward(HMM_pattern, seq)

labels = ["background", "S1", "S2", "S3", "S4"]

plt.figure(figsize=(15,6))
for i in range(len(fb_2)):
    plt.plot(range(1,len(fb_2[i])+1), fb_2[i], label=labels[i])

plt.xlabel('Position')
plt.ylabel('Probability')
plt.title('a(Bg → site)=0.001')
plt.legend()
plt.grid(True)
plt.show()

for site in fb_2_ind:
    print("Restriction site:", site, site+3)
Restriction site: 109 112
Restriction site: 166 169
Restriction site: 212 215

But if we continue to lower the value a(Bg → site), FB-algorith will also "lose" the sites. However, they won't be lost completely unlike the case of Viterbi algorithm, because we still see non-zero probabilities of the sites

In [153]:
#change the probabilities (a(Bg → site)=0.001)
class HMM_0001:
  def __init__(self):
    self.states = ["Bg", "S1", "S2", "S3", "S4"]
    #probabilities of transitions from one state to another
    self.transition_probs = {
"B": {"B": 0, "Bg" : 0.99, "S1" : 0.01, "S2": 0, "S3": 0, "S4": 0, "E" : 0},
"Bg": {"B" : 0, "Bg" : 0.9899, "S1" : 0.0001, "S2": 0, "S3": 0, "S4": 0, "E" : 0.01},
"S1" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 1, "S3" : 0, "S4" : 0, "E" : 0},
"S2" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 1, "S4" : 0, "E" : 0},
"S3" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 1, "E" : 0},
"S4" : {"B" : 0, "Bg" : 0.99, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 0, "E" : 0.01},
"E" : {"B": 0, "Bg" : 0, "S1" : 0, "S2" : 0, "S3" : 0, "S4" : 0, "E" : 1}
}
    #probabilities of letters given each state
    self.outcomes_probs = {
"B" : {"a" : 0, "c": 0, "g": 0, "t": 0, "B": 1, "E": 0},
"Bg": {"a" : 0.25, "c": 0.25, "g": 0.25, "t": 0.25, "B": 0, "E": 0},
"S1": {"a" : 1, "c": 0, "g": 0, "t": 0, "B": 0, "E": 0},
"S2": {"a" : 0, "c": 1, "g": 0, "t": 0, "B": 0, "E": 0},
"S3": {"a" : 0, "c": 0, "g": 1, "t": 0, "B": 0, "E": 0},
"S4": {"a" : 0, "c": 0, "g": 0, "t": 1, "B": 0, "E": 0},
"E" : {"a" : 0, "c": 0, "g": 0, "t": 0, "B": 0, "E": 1}
}

HMM_pattern = HMM_0001()

fb_3_ind, fb_3 = forward_backward(HMM_pattern, seq)

labels = ["background", "S1", "S2", "S3", "S4"]

plt.figure(figsize=(15,6))
for i in range(len(fb_3)):
    plt.plot(range(1,len(fb_3[i])+1), fb_3[i], label=labels[i])

plt.xlabel('Position')
plt.ylabel('Probability')
plt.title('a(Bg → site)=0.0001')
plt.legend()
plt.grid(True)
plt.show()

print("sites found:", len(fb_3_ind))
sites found: 0

Generative HMM model¶

In [154]:
HMM_pattern = HMM_01()

#function that generates sequence according the HMM
#note, that the length is undetermined - the sequence end when reaches "E" state
def generate_HMM(HMM_pattern):
    seq_states = [ "B" ]
    seq = [ "B" ]
    #next state and letter are chosen based on their probabilities
    while seq[-1] != 'E':
        next_state = np.random.choice(list(HMM_pattern.transition_probs[seq_states[-1]].keys()), p=list(HMM_pattern.transition_probs[seq_states[-1]].values()))
        next_letter = np.random.choice(list(HMM_pattern.outcomes_probs[next_state].keys()), p=list(HMM_pattern.outcomes_probs[next_state].values()))
        seq_states.append(next_state)
        seq.append(next_letter)
    return(''.join(seq))

np.random.seed(123) #set seed for reproducibility
generated_seq = generate_HMM(HMM_pattern)
generated_seq
Out[154]:
'BcgcacgtgagtgcagccctgacctgtgcggggaagtggaggctacttgccgaggcctggtagtgcatacgtgtattcatctattgataacattggggggaatggagtccattagaatatgcacgtccttcaE'
In [155]:
viterbi_myseq = viterbi(HMM_pattern, generated_seq)

for site in viterbi_myseq:
    print("Restriction site:", site, site+3)
Restriction site: 3 6
Restriction site: 67 70
Restriction site: 121 124
In [156]:
fb_myseq_ind, fb_myseq =  forward_backward(HMM_pattern, generated_seq)

labels = ["background", "S1", "S2", "S3", "S4"]
plt.figure(figsize=(10,6))
for i in range(len(fb_myseq)):
    plt.plot(range(1,len(fb_myseq[i])+1), fb_myseq[i], label=labels[i])

plt.xlabel('Position')
plt.ylabel('Value')
plt.legend()
plt.grid(True)
plt.show()

for site in fb_myseq_ind:
    print("Restriction site:", site, site+3)
Restriction site: 3 6
Restriction site: 67 70
Restriction site: 121 124

The results of the algorithms match, the both find 3 restriction site