In [50]:
import numpy as np

Smith Waterman algorithm with linear gap penalty¶

In [21]:
#Function that return the indices of all items with the highest value
def argmax_all(arr):
  max_value = max(arr)
  indices = [i for i, x in enumerate(arr) if x == max_value]
  return indices

#Function, that computes the scoring matrix (uses linear gap penalty)
def Smith_Waterman_linear(seq1, seq2, match_bonus, mismatch, deletion):
  matrix = [[ [0, []] for _ in range(len(seq1)+1)] for _ in range(len(seq2)+1)]

  #each cell of the matrix has the following form: [value, [possible parents]]
  #there may be several "parent" cells, so keep track of all of them

  for i in range(1, len(seq1)+1):
    for j in range(1, len(seq2)+1):

      mis_match = match_bonus if seq1[i-1] == seq2[j-1] else mismatch
      matrix[j][i][0]=max(matrix[j-1][i-1][0]+mis_match, matrix[j-1][i][0]+deletion, matrix[j][i-1][0]+deletion, 0)


      up_left = matrix[j-1][i-1][0]+mis_match
      up = matrix[j-1][i][0]+deletion
      left = matrix[j][i-1][0]+deletion

      parent = argmax_all([up_left, up, left])

      matrix[j][i][1]=parent

  return matrix

#Function finds the maximum value of the matrix (aka Score) and the coordinates of the said maximum, that represent the "beginnings" if the local alignements
def find_max(matrix):
  MAXs = []

  i_max = 0
  j_max = 0
  score = 0

  for i in range(1, len(matrix[0])):
    for j in range(1, len(matrix)):
      #there may be several cells with the same (maximum) score, so I keep all their coordinates in the list MAXs
      if matrix[j][i][0]>score:
        score = matrix[j][i][0]
        MAXs = [[i,j]]
      elif matrix[j][i][0]==score:
        MAXs.append([i,j])

  return MAXs, score

#using the matrix and the coordinates of the begnning, the function presents the alignements in human-readable format
def align_from_matrix(matrix, seq1, seq2, max_coord):

  i_now = max_coord[0]
  j_now = max_coord[1]

  alignements = []

  seq1_aligned = ''
  seq2_aligned = ''
  symbols_align = ''

  while matrix[j_now][i_now][0] != 0:
    if len(matrix[j_now][i_now][1])==1:
      if matrix[j_now][i_now][1][0] == 0: #up left
        seq1_aligned+=seq1[i_now-1]
        seq2_aligned+=seq2[j_now-1]
        if seq1[i_now-1]==seq2[j_now-1]:
          symbols_align+='|'
        else:
          symbols_align+=':'
        j_now-=1
        i_now-=1

      elif matrix[j_now][i_now][1][0] == 1: #up
        j_now-=1
        seq1_aligned+="-"
        seq2_aligned+=seq2[j_now]
        symbols_align+=' '

      elif matrix[j_now][i_now][1][0] == 2: #left
        i_now-=1
        seq1_aligned+=seq1[i_now]
        seq2_aligned+="-"
        symbols_align+=' '

    else:
      matrix_special = matrix
      for j in matrix[j_now][i_now][1]:
        matrix_special[j_now][i_now][1]=[j]
        align = align_from_matrix(matrix_special, seq1, seq2, max_coord)
        alignements+=align

  alignements.append([seq1_aligned[::-1], symbols_align[::-1], seq2_aligned[::-1]])

  return alignements

def unique_lists(list_of_lists):
  unique_set = set(tuple(sub_list) for sub_list in list_of_lists)
  unique_lists = [list(sub_list) for sub_list in unique_set]
  return unique_lists

#join all the said functions in one
def SW_linear_alignements(seq1, seq2, match_bonus, mismatch, deletion):
  matrix = Smith_Waterman_linear(seq1, seq2, match_bonus, mismatch, deletion)
  MAXs, score = find_max(matrix)
  alignements = []
  for i in MAXs:
    a = align_from_matrix(matrix, seq1, seq2, i)
    alignements+=a
  final = unique_lists(alignements)
  return score, final

def read_align(align):
  for i in range(len(align)):
    print("\n____ALIGN", i+1, "____")
    for j in align[i]:
        print(j)

Tests¶

All the 'self made' alignements were compared to the ones recieved with http://rna.informatik.uni-freiburg.de/Teaching/index.jsp?toolName=Smith-Waterman

Test 1:

The same sequenecs align completely

In [25]:
score, align = SW_linear_alignements("GATTTGGTTTCGTAACTTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 31

____ALIGN 1 ____
GATTTGGTTTCGTAACTTTCGAAAATAGTGC
|||||||||||||||||||||||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

Test 2:

Sequences with a couple of mismatches

Because the match_bonus and mismatch_pemalty have the same absolute value, it makes no difference whether to keep two last nucleotides (and get +1 for match and -1 for mismatch) as in the "ALIGN 1" or to skip them ("ALIGN 2")

In [26]:
score, align = SW_linear_alignements("GATCTGGTTTCGTAGCTTTCGAAAATAGTTC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 25

____ALIGN 1 ____
GATCTGGTTTCGTAGCTTTCGAAAATAGTTC
|||:||||||||||:||||||||||||||:|
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 2 ____
GATCTGGTTTCGTAGCTTTCGAAAATAGT
|||:||||||||||:||||||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGT

The changes of the parameters may tip the scales for either the first variant or the second one:

In [27]:
score, align = SW_linear_alignements("GATCTGGTTTCGTAGCTTTCGAAAATAGTTC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 2, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 53

____ALIGN 1 ____
GATCTGGTTTCGTAGCTTTCGAAAATAGTTC
|||:||||||||||:||||||||||||||:|
GATTTGGTTTCGTAACTTTCGAAAATAGTGC
In [29]:
score, align = SW_linear_alignements("GATCTGGTTTCGTAGCTTTCGAAAATAGTTC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -2, deletion = -5)
print("Score:", score)
read_align(align)
Score: 23

____ALIGN 1 ____
GATCTGGTTTCGTAGCTTTCGAAAATAGT
|||:||||||||||:||||||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGT

Test 3:

3.1 Deletion in the beginning/end of the sequence => the alignements are cropped, as expected

In [31]:
score, align = SW_linear_alignements("TCGTAACTTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 22

____ALIGN 1 ____
TCGTAACTTTCGAAAATAGTGC
||||||||||||||||||||||
TCGTAACTTTCGAAAATAGTGC

3.2 Deletion in the middle => depending on the length of the deletion and deletion penalty:

  • it may be "forgiven"

  • one of the halves may be outweighed by the deletion penalty and lost

short deletion, small penalty

In [33]:
score, align = SW_linear_alignements("GATTTGGTTTCGTATTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 27

____ALIGN 1 ____
GATTTGGTTTCGT-A-TTTCGAAAATAGTGC
||||||||||||| | |||||||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 2 ____
GATTTGGTTTCGTA--TTTCGAAAATAGTGC
||||||||||||||  |||||||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

small deletion, huge penalty

In [34]:
score, align = SW_linear_alignements("GATTTGGTTTCGTATTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -10)
print("Score:", score)
read_align(align)
Score: 15

____ALIGN 1 ____
TTTCGAAAATAGTGC
|||||||||||||||
TTTCGAAAATAGTGC

huge deletion

In [48]:
score, align = SW_linear_alignements("GATTTGGTTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 15

____ALIGN 1 ____
TTTCGAAAATAGTGC
|||||||||||||||
TTTCGAAAATAGTGC

Test 4:

The beginning of one sequence is the end of another

The same part aligns just fine =)

In [49]:
score, align = SW_linear_alignements("GGATTTGGTTTCGTAACTTTCGAAAATAGTG", "GAAAATAGTGCCGATGCGGACAGGGCGAGGGAGA", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 10

____ALIGN 1 ____
GAAAATAGTG
||||||||||
GAAAATAGTG

Test 5:

Checking the work of "find all the best alignements" part.

All the following alignements have the same score, because the gap penalty is linear - we see how "C-C" migrate through the indel.

Everything seems fine and coresponds to he "golden standart" from the web site.

In [22]:
score, align = SW_linear_alignements("TACCCCGGGCCCGCTAC", "TACGGGCCCGCTAC", match_bonus = 2, mismatch = -2, deletion = -1)
print("Score:", score)
read_align(align)
Score: 25

____ALIGN 1 ____
TACCCCGGGCCCGCTAC
||   ||||||||||||
TA---CGGGCCCGCTAC

____ALIGN 2 ____
TACCCCGGGCCCGCTAC
|| |  |||||||||||
TA-C--GGGCCCGCTAC

____ALIGN 3 ____
TACCCCGGGCCCGCTAC
||  | |||||||||||
TA--C-GGGCCCGCTAC

____ALIGN 4 ____
TACCCCGGGCCCGCTAC
|||   |||||||||||
TAC---GGGCCCGCTAC

In ambiguous cases there may be a lot of alignements with the same score

In [41]:
score, align = SW_linear_alignements("GATTTGGTTTCGTGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 17

____ALIGN 1 ____
GATTTGGTTTCGT-------GAAAATAGTGC
|||||||||||||       |||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 2 ____
GATTTGGTTTCG------T-GAAAATAGTGC
||||||||||||      | |||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 3 ____
GATTTGGTTTCG-----T--GAAAATAGTGC
||||||||||||     |  |||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 4 ____
GATTTGGTTTCG----T---GAAAATAGTGC
||||||||||||    |   |||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC
In [45]:
score, align = SW_linear_alignements("GATTTGGTTTCGTAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 15

____ALIGN 1 ____
GATTTGGTTTCGTAA------AA--TAGTGC
|||||||||||||||      ||  ||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 2 ____
GATTTGGTTTCGTAA------A-A-TAGTGC
|||||||||||||||      | | ||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 3 ____
GATTTGGTTTCGTAA------A--ATAGTGC
|||||||||||||||      |  |||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 4 ____
GATTTGGTTTCGTAA-------A-ATAGTGC
|||||||||||||||       | |||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 5 ____
GATTTGGTTTCG----T----AAAATAGTGC
||||||||||||    |    ||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 6 ____
GATTTGGTTTCG-----T---AAAATAGTGC
||||||||||||     |   ||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 7 ____
GATTTGGTTTCGT-A-------AAATAGTGC
||||||||||||| |       |||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 8 ____
GATTTGGTTTCGTA-------A-AATAGTGC
||||||||||||||       | ||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 9 ____
GATTTGGTTTCGTAA
|||||||||||||||
GATTTGGTTTCGTAA

____ALIGN 10 ____
GATTTGGTTTCGTA--------AAATAGTGC
||||||||||||||        |||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 11 ____
GATTTGGTTTCGTAA--------AATAGTGC
|||||||||||||||        ||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 12 ____
GATTTGGTTTCGT--------AAAATAGTGC
|||||||||||||        ||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 13 ____
GATTTGGTTTCG------T--AAAATAGTGC
||||||||||||      |  ||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

Smith Waterman algorithm with affine gap penalty¶

In [90]:
#all the function stay the same except the one that computes the matrix
def Smith_Waterman_affine(seq1, seq2, match_bonus, mismatch, del_open, del_ext):
  matrix = [[ [0, "parent"] for _ in range(len(seq1)+1)] for _ in range(len(seq2)+1)]

  i_max = 0
  j_max = 0
  num_max = 0

  for i in range(1, len(seq1)+1):
    for j in range(1, len(seq2)+1):

      mis_match = match_bonus if seq1[i-1] == seq2[j-1] else mismatch

      del_y = del_open if matrix[j-1][i][1][0]!=1 else del_ext
      del_x = del_open if matrix[j][i-1][1][0]!=2 else del_ext

      matrix[j][i][0]=max(matrix[j-1][i-1][0]+mis_match, matrix[j-1][i][0]+del_y, matrix[j][i-1][0]+del_x, 0)


      up_left = matrix[j-1][i-1][0]+mis_match
      up = matrix[j-1][i][0]+del_y
      left = matrix[j][i-1][0]+del_x

      #dir = ['up_left', 'up', 'left']
      parent = np.argmax([up_left, up, left])
      matrix[j][i][1]=[parent]

  return matrix

def SW_affine_alignements(seq1, seq2, match_bonus, mismatch, del_open, del_ext):
  matrix = Smith_Waterman_affine(seq1, seq2, match_bonus, mismatch, del_open, del_ext)
  MAXs, score = find_max(matrix)
  alignements = []
  for i in MAXs:
    a = align_from_matrix(matrix, seq1, seq2, i)
    alignements+=a
  final = unique_lists(alignements)
  return score, final

Tests¶

The alignements were compered with the ones recieved from http://rna.informatik.uni-freiburg.de/Teaching/index.jsp?toolName=Gotoh

Test 1:

Same sequences

In [92]:
score, align = SW_linear_alignements("GATTTGGTTTCGTAACTTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 31

____ALIGN 1 ____
GATTTGGTTTCGTAACTTTCGAAAATAGTGC
|||||||||||||||||||||||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

Test 2:

Mismatches

In [93]:
score, align = SW_linear_alignements("GATCTGGTTTCGTAGCTTTCGAAAATAGTTC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 25

____ALIGN 1 ____
GATCTGGTTTCGTAGCTTTCGAAAATAGTTC
|||:||||||||||:||||||||||||||:|
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 2 ____
GATCTGGTTTCGTAGCTTTCGAAAATAGT
|||:||||||||||:||||||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGT

Test 3:

Much more interesting, however, is how this algorithm deals with gaps - I will use the same examples for easier comparison.

When using linear penalty, it makes no difference wheather the gaps stick together or not, leading to a much higher number of "top" alignements. When the penalty is affine, the best alignement is the one where the gaps are side by side.

In [98]:
#linear gap penalty
score, align = SW_linear_alignements("GATTTGGTTTCGTATTTCGAAAATAGTGC", "GATTTGGTTTCGTATACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 26

____ALIGN 1 ____
GATTTGGTTTCG--TA-TTTCGAAAATAGTGC
||||||||||||  || |||||||||||||||
GATTTGGTTTCGTATACTTTCGAAAATAGTGC

____ALIGN 2 ____
GATTTGGTTTCGTAT--TT-CGAAAATAGTGC
|||||||||||||||  || ||||||||||||
GATTTGGTTTCGTATACTTTCGAAAATAGTGC

____ALIGN 3 ____
GATTTGGTTTCGTAT--T-TCGAAAATAGTGC
|||||||||||||||  | |||||||||||||
GATTTGGTTTCGTATACTTTCGAAAATAGTGC

____ALIGN 4 ____
GATTTGGTTTCGTA---TTTCGAAAATAGTGC
||||||||||||||   |||||||||||||||
GATTTGGTTTCGTATACTTTCGAAAATAGTGC

____ALIGN 5 ____
GATTTGGTTTCGT--A-TTTCGAAAATAGTGC
|||||||||||||  | |||||||||||||||
GATTTGGTTTCGTATACTTTCGAAAATAGTGC

____ALIGN 6 ____
GATTTGGTTTCGTAT---TTCGAAAATAGTGC
|||||||||||||||   ||||||||||||||
GATTTGGTTTCGTATACTTTCGAAAATAGTGC
In [99]:
#affine gap penalty
score, align = SW_affine_alignements("GATTTGGTTTCGTATTTCGAAAATAGTGC", "GATTTGGTTTCGTATACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, del_open = -5, del_ext = -1)
print("Score:", score)
read_align(align)
Score: 22

____ALIGN 1 ____
GATTTGGTTTCGTAT---TTCGAAAATAGTGC
|||||||||||||||   ||||||||||||||
GATTTGGTTTCGTATACTTTCGAAAATAGTGC

This seems to be the proof that the algorithm works adequately.

Distribution of scores¶

In [108]:
import random

#generate random 20 sequences (length 200)
seqs = []

nucl = ["A", "T", "G", "C"]
for k in range(20):
  seq = ''
  for symbol in range(200):
     seq += nucl[random.randint(0, len(nucl) - 1)]
  seqs.append(seq)

scores_linear = []
scores_affine = []

#Compute the scores with both algorithms
for i in range(len(seqs)):
  for j in range(i+1, len(seqs)):
    matrix_linear = Smith_Waterman_linear(seqs[i], seqs[j], match_bonus = 1, mismatch = -1, deletion=-1)
    m, score_linear = find_max(matrix_linear)
    scores_linear.append(score_linear)

    matrix_affine = Smith_Waterman_affine(seqs[i], seqs[j], match_bonus = 1, mismatch = -1, del_open = -5, del_ext=-1)
    m, score_affine = find_max(matrix_affine)
    scores_affine.append(score_affine)
In [113]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.histplot(scores_linear, color='orange', fill=True, label='linear penalty', binwidth=2)
sns.histplot(scores_affine, color='red', fill=True, label='affine penalty', binwidth=2)
plt.legend()
plt.xlabel("Score")
Out[113]:
Text(0.5, 0, 'Score')

Judging by the distribution, it can be concluded that affine penalty is stricter. The scores for random sequences are lower, so affine penalty should better filter out random alignments (alignements of non-homologous sequences) .