Задание 1. Схема автомата для модели¶

restriction_site_automata.ong.png

In [ ]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
In [ ]:
test_seq = '''gataggattatcattcataagtttcagagcaatgtccttattctggaacttggatttatggctctt
ttggtttaatttcgcctgattcttgatctcctttagcttctcgacgtgggcctttttcttgccata
tggatccgctgcacggtcctgttccctagcatgtacgtgagcgtatttccttttaaaccacgacgc
tttgtcttcattcaacgtttcccattgtttttttctactattgctttgctgtgggaaaaacttatc
gaaagatgacgactttttcttaattctcgttttaagagcttggtgagcgctaggagtcactgccag'''
test_seq = ''.join(test_seq.split('\n'))
states = ['bg', 'S1', 'S2', 'S3', 'S4']
transition_matrix_test = np.array(
                         [[0.99, 0.01, 0, 0, 0],
                          [0, 0, 1, 0, 0],
                          [0, 0, 0, 1, 0],
                          [0, 0, 0, 0, 1],
                          [0.99, 0, 0, 0, 0]])
emission_matrices_test = np.array([[0.25, 0.25, 0.25, 0.25],
                                  [1, 0, 0, 0],
                                  [0, 1, 0, 0],
                                  [0, 0, 1, 0],
                                  [0, 0, 0, 1]])
initial_transitions_test = [0.99, 0.01, 0, 0, 0]
final_transitions_test = [0.01, 0, 0, 0, 0.01]

Задание 2. Алгоритм Витерби¶

In [ ]:
# Viterbi algorithm
def Vitebri(seq, states, transition_matrix, emission_matrices, initial=None, final=None):
  """
  Parameters
  ----------
  seq: str
    A protein or nucleotide sequence (recommended)
  states: 1d array
    A list of states, including the initial and the final states. The initial and
    the final states should not be included.
  transition_matrix: 2d array
    A probability matrix for transitions to all the hidden states. The probabilities
    should correspond to the states listed in "states" variable
  emission_matrices: 2d array
    A matrix for probabilities of observations given a hidden state (excluding
    the initial and the final states). The probabilities should correspond to the
    sequence alphabet in alphabetical order
  initial: 1d array, optional
    A list of probabilities for transitioning from the initial state. Default
    probabilities are equal for every hidden state
  final: 1d array, optional
    A list of probabilities for transitioning to the final state. Default
    probabilities are equal for every hidden state
  """

  # if no optional parameters were given
  if not initial:
    initial = [1/len(states) for i in states]
  if not final:
    final = [1/len(states) for i in states]

  # creating a n_states * seq_length matrix for interim likelihoods
  alphabet = sorted(list(set(seq)))
  dyn_matrix = np.array([[0.0 for i in range(len(seq))] for j in range(len(states))])
  letter = alphabet.index(seq[0])

  # creating a n_states * se_length matrix for an optimal path
  path = [['' for i in seq] for j in states]

  # calculating probabilities for the first letter
  for state in range(len(states)):
    prob = initial[state] * emission_matrices_test[state][letter]
    dyn_matrix[state][0] = prob
  prev_letter = letter

  # Calculating dyn_matrix[state][i] for every state; choose maximum value out
    # of all previous scores multiplied by the probability of transitioning from
    # the previous to current state and a corresponding probability from the
    # emission matrix
  for i in range(1, len(seq)):
    letter = alphabet.index(seq[i])
    for state in range(len(states)):
      for prev_state in range(len(states)):
        lh = dyn_matrix[prev_state][i - 1] * transition_matrix[prev_state][state] * emission_matrices[state][prev_letter]
        if lh > dyn_matrix[state][i]:
          dyn_matrix[state][i] = lh
          path[state][i] = prev_state
    prev_letter = letter

  # calculating final likelihoods
  for state in range(len(states)):
    dyn_matrix[state][-1] *= final[state]
  prev_letter = letter

  # Reverse traversal to find a path with the maximum likelihood
  prev_state = np.argmax([x[-1] for x in dyn_matrix])
  print(f'Likelihood of an optimal path is {np.max([x[-1] for x in dyn_matrix])}')
  reversed_path = []
  reversed_path.append(prev_state)
  for i in range(len(seq) - 1, 0, -1):
    prev_state = path[prev_state][i]
    reversed_path.append(prev_state)
  return reversed_path[::-1]
In [ ]:
def find_sites(opt_path, site_states, site=''):
  '''
  Parameters:
  opt_path: 1d array
    Optimal path found by Vitebri algorithm
  states: 1d array
    An array of consecutive hidden states corresponding to the restriction site
  site: str, optional
    A sequence of your site
  -----------
  '''
  cnt = 0
  for i, st in enumerate(opt_path):
    if st == site_states[cnt] - 1:
      cnt += 1
    if cnt == len(site_states):
      print(f'Found restriction site {site} in position {i - cnt + 1} - {i}')
      cnt = 0
In [ ]:
res1 = Vitebri(test_seq, states, transition_matrix_test,
               emission_matrices_test, initial_transitions_test,
               final_transitions_test)
Likelihood of an optimal path is 1.4352073166535307e-201
In [ ]:
find_sites(res1, [1, 2, 3, 4], 'ACGT')
Found restriction site ACGT in position 109 - 112
Found restriction site ACGT in position 166 - 169
Found restriction site ACGT in position 212 - 215

Проверим, совпадают ли результаты с обычным поиском по строке (а они должны).

In [ ]:
import re
[x.span() for x in re.finditer('acgt', test_seq)]
Out[ ]:
[(109, 113), (166, 170), (212, 216)]

Теперь изменим вероятность перехода из фона в сайт с 0.01 на 0.001.

In [ ]:
transition_matrix_test[0][1] = 0.001

res1_1 = Vitebri(test_seq, states, transition_matrix_test,
               emission_matrices_test, initial_transitions_test,
               final_transitions_test)

transition_matrix_test[0][1] = 0.01


find_sites(res1_1, [1, 2, 3, 4], 'ACGT')
Likelihood of an optimal path is 7.582581360685768e-203

Как видно, ничего не нашлось. Наиболее оптимальный путь в рамках данной модели не содержал в себе переходы в сайты рестрикции: они были приняты за фоновую модель.

Задание 3. Алгоритм Forward-backward¶

In [ ]:
# Forward-backward algorigthm
def forward_backward(seq, states, transition_matrix, emission_matrices, initial=None, final=None):
  '''
  Refer to Vitebri for parameter description
  '''

 # if no optional parameters were given
  if not initial:
    initial = [1/len(states) for i in states]
  if not final:
    final = [1/len(states) for i in states]

  # Forward step
  forward_probabilities = np.array([[0.0 for i in range(len(seq))] for j in range(len(states))])
  alphabet = sorted(list(set(seq)))
  letter = alphabet.index(seq[0])

  # Calculating first probabilities
  for state in range(len(states)):
    forward_probabilities[state][0] = initial[state]

  # Calculating forward probability by summing previous forward probabilities
  # multiplied by transition probability (prev_state -> cur_state) and multiplying
  # it by a corresponding emission probability given a current state
  for i in range(1, len(seq)):
    letter = alphabet.index(seq[i])
    for state in range(len(states)):
      for prev_state in range(len(states)):
        forward_probabilities[state][i] += forward_probabilities[prev_state][i - 1] * transition_matrix[prev_state][state]
      forward_probabilities[state][i] *= emission_matrices[state][letter]

  # Calculating final state probabilities
  for state in range(len(states)):
    forward_probabilities[state][-1] *= final[state]

  # Backward step
  backward_probabilities = np.array([[0.0 for i in range(len(seq))] for j in range(len(states))])

  # Calculating final state probabilities
  for state in range(len(states)):
    backward_probabilities[state][-1] = final[state]
  # Calculating these probabilities is basically the same as the forward part
  for i in range(len(seq) - 2, -1, -1):
    letter = alphabet.index(seq[i + 1])
    for prev_state in range(len(states)):
      for state in range(len(states)):
        backward_probabilities[state][i] += backward_probabilities[prev_state][i + 1] * transition_matrix[state][prev_state] * emission_matrices[prev_state][letter]
  # Calculating initial state probabilities
  for state in range(len(states)):
    backward_probabilities[state][0] *= initial[state]

  full_seq_prob = sum((x[-1] for x in forward_probabilities))

  # Calculating aposterior probabilities
  apost = forward_probabilities * backward_probabilities / full_seq_prob

  return apost
In [ ]:
res2 = forward_backward(test_seq, states, transition_matrix_test,
               emission_matrices_test, initial_transitions_test,
               final_transitions_test)

При построении графика можем заменить резкие падения в вероятностях для фонового состояния в местах, где находятся сайты рестрикции (их обнаружено 3, как и в предыдущем случае).

In [ ]:
def draw_task2(res):
  fig, ax = plt.subplots()
  for state in range(len(res)):
    plt.plot(res[state], label=states[state])
  ax.set_title('Restriction sites found by forward-backward algorithm')
  plt.legend()
  plt.show()

draw_task2(res2)
No description has been provided for this image

Попробуем поменять вероятности перехода из фона в сайт.Сначала увеличим ее до 0.1 (изначальное значение 0.01), чтобы увидеть заметные изменения. И действительно, вероятность для состояний, соответствующих сайту рестрикции, теперь выше.

In [ ]:
transition_matrix_test[0][1] = 0.1

res2_1 = forward_backward(test_seq, states, transition_matrix_test,
               emission_matrices_test, initial_transitions_test,
               final_transitions_test)

transition_matrix_test[0][1] = 0.01

draw_task2(res2_1)
No description has been provided for this image

В противоположной ситуации (при замене указанного значения с 0.01 на 0.001) видно, что вероятности, которые указывают на нахождения рестриктных сайтов, становятся меньше. Если уменьшать значение вероятности перехода из фона в сайт еще дальше, рестриктные сайты будут приняты за фон (например, для значения 0.00001)

In [ ]:
transition_matrix_test[0][1] = 0.001

res2_1 = forward_backward(test_seq, states, transition_matrix_test,
               emission_matrices_test, initial_transitions_test,
               final_transitions_test)

transition_matrix_test[0][1] = 0.01
draw_task2(res2_1)
No description has been provided for this image
In [ ]:
transition_matrix_test[0][1] = 0.00001
res2_1 = forward_backward(test_seq, states, transition_matrix_test,
               emission_matrices_test, initial_transitions_test,
               final_transitions_test)

transition_matrix_test[0][1] = 0.01

draw_task2(res2_1)
No description has been provided for this image

Задание 4. Генерация последовательности с сайтами рестрикции¶

In [ ]:
# Sequence generation
def generate_hmm_seq(alphabet, states, transition_matrix, emission_matrices, initial=None, final=None):
  '''
  Parameters:
  -----------
  length: int, optional
    length of a generated sequence
  For the rest refer to Vitebri for parameter description
  '''
 # if no optional parameters were given
  if not initial:
    initial = [1/len(states) for i in states]
  if not final:
    final = [1/len(states) for i in states]
  sequence = ''
  all_states = list(np.append(np.insert(states, 0, 'Beg'), 'End'))
  state = states.index(np.random.choice(states, p=initial))
  prev_state = state
  while all_states[state] != 'End':
    state = states.index(np.random.choice(states, p=transition_matrix[prev_state]))
    letter = np.random.choice(alphabet, p=emission_matrices[state])
    all_probs =  np.append(np.insert(transition_matrix[state], 0, 0), final[state])
    all_probs[1] -= final[state]
    end = np.random.choice(all_states, p=all_probs)
    if end == 'End':
      state = all_states.index(end)
    sequence += letter
  return sequence
In [ ]:
res_3 = generate_hmm_seq(['A', 'C', 'G', 'T'], states, transition_matrix_test,
               emission_matrices_test, initial_transitions_test,
               final_transitions_test)
res_3
Out[ ]:
'ATGTTAGTCGACTAGAGACTGCTACATAGGACTACAGTTCGACATAAAATCGTTCGGCTTTCACTAGTCCCCGCACCAGCCTAAGGGTTACGTCTATAAGGCGTAGTACCCCTCTCAGACGTGAAGGAAGGGTGATCGCTTTGTAATAGCCAGACCCGCGCGGGTAAATAGCGCGTGTTAGGATCGCAGAGCTTCG'

В сгенерированной случайной последовательности обнаружены 2 сайта рестрикции при помощи алгоритма Витебри и алгоритма "Вперед-назад".

In [ ]:
res3_1 = forward_backward(res_3, states, transition_matrix_test,
               emission_matrices_test, initial_transitions_test,
               final_transitions_test)

draw_task2(res3_1)
No description has been provided for this image
In [ ]:
res3_2 = Vitebri(res_3, states, transition_matrix_test,
               emission_matrices_test, initial_transitions_test,
               final_transitions_test)
find_sites(res3_2, [1, 2, 3, 4], 'ACGT')
Likelihood of an optimal path is 9.820604350736465e-121
Found restriction site ACGT in position 89 - 92
Found restriction site ACGT in position 118 - 121