import numpy as np


#Function that return the indices of all items with the highest value
def argmax_all(arr):
  max_value = max(arr)
  indices = [i for i, x in enumerate(arr) if x == max_value]
  return indices

#Function, that computes the scoring matrix (uses linear gap penalty)
def Smith_Waterman_linear(seq1, seq2, match_bonus, mismatch, deletion):
  matrix = [[ [0, []] for _ in range(len(seq1)+1)] for _ in range(len(seq2)+1)]

  #each cell of the matrix has the following form: [value, [possible parents]]
  #there may be several "parent" cells, so keep track of all of them

  for i in range(1, len(seq1)+1):
    for j in range(1, len(seq2)+1):

      mis_match = match_bonus if seq1[i-1] == seq2[j-1] else mismatch
      matrix[j][i][0]=max(matrix[j-1][i-1][0]+mis_match, matrix[j-1][i][0]+deletion, matrix[j][i-1][0]+deletion, 0)


      up_left = matrix[j-1][i-1][0]+mis_match
      up = matrix[j-1][i][0]+deletion
      left = matrix[j][i-1][0]+deletion

      parent = argmax_all([up_left, up, left])

      matrix[j][i][1]=parent

  return matrix

#Function finds the maximum value of the matrix (aka Score) and the coordinates of the said maximum, that represent the "beginnings" if the local alignements
def find_max(matrix):
  MAXs = []

  i_max = 0
  j_max = 0
  score = 0

  for i in range(1, len(matrix[0])):
    for j in range(1, len(matrix)):
      #there may be several cells with the same (maximum) score, so I keep all their coordinates in the list MAXs
      if matrix[j][i][0]>score:
        score = matrix[j][i][0]
        MAXs = [[i,j]]
      elif matrix[j][i][0]==score:
        MAXs.append([i,j])

  return MAXs, score

#using the matrix and the coordinates of the begnning, the function presents the alignements in human-readable format
def align_from_matrix(matrix, seq1, seq2, max_coord):

  i_now = max_coord[0]
  j_now = max_coord[1]

  alignements = []

  seq1_aligned = ''
  seq2_aligned = ''
  symbols_align = ''

  while matrix[j_now][i_now][0] != 0:
    if len(matrix[j_now][i_now][1])==1:
      if matrix[j_now][i_now][1][0] == 0: #up left
        seq1_aligned+=seq1[i_now-1]
        seq2_aligned+=seq2[j_now-1]
        if seq1[i_now-1]==seq2[j_now-1]:
          symbols_align+='|'
        else:
          symbols_align+=':'
        j_now-=1
        i_now-=1

      elif matrix[j_now][i_now][1][0] == 1: #up
        j_now-=1
        seq1_aligned+="-"
        seq2_aligned+=seq2[j_now]
        symbols_align+=' '

      elif matrix[j_now][i_now][1][0] == 2: #left
        i_now-=1
        seq1_aligned+=seq1[i_now]
        seq2_aligned+="-"
        symbols_align+=' '

    else:
      matrix_special = matrix
      for j in matrix[j_now][i_now][1]:
        matrix_special[j_now][i_now][1]=[j]
        align = align_from_matrix(matrix_special, seq1, seq2, max_coord)
        alignements+=align

  alignements.append([seq1_aligned[::-1], symbols_align[::-1], seq2_aligned[::-1]])

  return alignements

def unique_lists(list_of_lists):
  unique_set = set(tuple(sub_list) for sub_list in list_of_lists)
  unique_lists = [list(sub_list) for sub_list in unique_set]
  return unique_lists

#join all the said functions in one
def SW_linear_alignements(seq1, seq2, match_bonus, mismatch, deletion):
  matrix = Smith_Waterman_linear(seq1, seq2, match_bonus, mismatch, deletion)
  MAXs, score = find_max(matrix)
  alignements = []
  for i in MAXs:
    a = align_from_matrix(matrix, seq1, seq2, i)
    alignements+=a
  final = unique_lists(alignements)
  return score, final

def read_align(align):
  for i in range(len(align)):
    print("\n____ALIGN", i+1, "____")
    for j in align[i]:
        print(j)


score, align = SW_linear_alignements("GATTTGGTTTCGTAACTTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)

Score: 31

____ALIGN 1 ____
GATTTGGTTTCGTAACTTTCGAAAATAGTGC
|||||||||||||||||||||||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC


score, align = SW_linear_alignements("GATCTGGTTTCGTAGCTTTCGAAAATAGTTC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)

Score: 25

____ALIGN 1 ____
GATCTGGTTTCGTAGCTTTCGAAAATAGTTC
|||:||||||||||:||||||||||||||:|
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 2 ____
GATCTGGTTTCGTAGCTTTCGAAAATAGT
|||:||||||||||:||||||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGT


score, align = SW_linear_alignements("GATCTGGTTTCGTAGCTTTCGAAAATAGTTC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 2, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)

Score: 53

____ALIGN 1 ____
GATCTGGTTTCGTAGCTTTCGAAAATAGTTC
|||:||||||||||:||||||||||||||:|
GATTTGGTTTCGTAACTTTCGAAAATAGTGC


score, align = SW_linear_alignements("GATCTGGTTTCGTAGCTTTCGAAAATAGTTC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -2, deletion = -5)
print("Score:", score)
read_align(align)

Score: 23

____ALIGN 1 ____
GATCTGGTTTCGTAGCTTTCGAAAATAGT
|||:||||||||||:||||||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGT


score, align = SW_linear_alignements("TCGTAACTTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)

Score: 22

____ALIGN 1 ____
TCGTAACTTTCGAAAATAGTGC
||||||||||||||||||||||
TCGTAACTTTCGAAAATAGTGC


score, align = SW_linear_alignements("GATTTGGTTTCGTATTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)

Score: 27

____ALIGN 1 ____
GATTTGGTTTCGT-A-TTTCGAAAATAGTGC
||||||||||||| | |||||||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 2 ____
GATTTGGTTTCGTA--TTTCGAAAATAGTGC
||||||||||||||  |||||||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC


score, align = SW_linear_alignements("GATTTGGTTTCGTATTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -10)
print("Score:", score)
read_align(align)

Score: 15

____ALIGN 1 ____
TTTCGAAAATAGTGC
|||||||||||||||
TTTCGAAAATAGTGC


score, align = SW_linear_alignements("GATTTGGTTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)

Score: 15

____ALIGN 1 ____
TTTCGAAAATAGTGC
|||||||||||||||
TTTCGAAAATAGTGC


score, align = SW_linear_alignements("GGATTTGGTTTCGTAACTTTCGAAAATAGTG", "GAAAATAGTGCCGATGCGGACAGGGCGAGGGAGA", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)

Score: 10

____ALIGN 1 ____
GAAAATAGTG
||||||||||
GAAAATAGTG


score, align = SW_linear_alignements("TACCCCGGGCCCGCTAC", "TACGGGCCCGCTAC", match_bonus = 2, mismatch = -2, deletion = -1)
print("Score:", score)
read_align(align)

Score: 25

____ALIGN 1 ____
TACCCCGGGCCCGCTAC
||   ||||||||||||
TA---CGGGCCCGCTAC

____ALIGN 2 ____
TACCCCGGGCCCGCTAC
|| |  |||||||||||
TA-C--GGGCCCGCTAC

____ALIGN 3 ____
TACCCCGGGCCCGCTAC
||  | |||||||||||
TA--C-GGGCCCGCTAC

____ALIGN 4 ____
TACCCCGGGCCCGCTAC
|||   |||||||||||
TAC---GGGCCCGCTAC


score, align = SW_linear_alignements("GATTTGGTTTCGTGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)

Score: 17

____ALIGN 1 ____
GATTTGGTTTCGT-------GAAAATAGTGC
|||||||||||||       |||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 2 ____
GATTTGGTTTCG------T-GAAAATAGTGC
||||||||||||      | |||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 3 ____
GATTTGGTTTCG-----T--GAAAATAGTGC
||||||||||||     |  |||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 4 ____
GATTTGGTTTCG----T---GAAAATAGTGC
||||||||||||    |   |||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC


score, align = SW_linear_alignements("GATTTGGTTTCGTAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)

Score: 15

____ALIGN 1 ____
GATTTGGTTTCGTAA------AA--TAGTGC
|||||||||||||||      ||  ||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 2 ____
GATTTGGTTTCGTAA------A-A-TAGTGC
|||||||||||||||      | | ||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 3 ____
GATTTGGTTTCGTAA------A--ATAGTGC
|||||||||||||||      |  |||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 4 ____
GATTTGGTTTCGTAA-------A-ATAGTGC
|||||||||||||||       | |||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 5 ____
GATTTGGTTTCG----T----AAAATAGTGC
||||||||||||    |    ||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 6 ____
GATTTGGTTTCG-----T---AAAATAGTGC
||||||||||||     |   ||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 7 ____
GATTTGGTTTCGT-A-------AAATAGTGC
||||||||||||| |       |||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 8 ____
GATTTGGTTTCGTA-------A-AATAGTGC
||||||||||||||       | ||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 9 ____
GATTTGGTTTCGTAA
|||||||||||||||
GATTTGGTTTCGTAA

____ALIGN 10 ____
GATTTGGTTTCGTA--------AAATAGTGC
||||||||||||||        |||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 11 ____
GATTTGGTTTCGTAA--------AATAGTGC
|||||||||||||||        ||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 12 ____
GATTTGGTTTCGT--------AAAATAGTGC
|||||||||||||        ||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 13 ____
GATTTGGTTTCG------T--AAAATAGTGC
||||||||||||      |  ||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC


#all the function stay the same except the one that computes the matrix
def Smith_Waterman_affine(seq1, seq2, match_bonus, mismatch, del_open, del_ext):
  matrix = [[ [0, "parent"] for _ in range(len(seq1)+1)] for _ in range(len(seq2)+1)]

  i_max = 0
  j_max = 0
  num_max = 0

  for i in range(1, len(seq1)+1):
    for j in range(1, len(seq2)+1):

      mis_match = match_bonus if seq1[i-1] == seq2[j-1] else mismatch

      del_y = del_open if matrix[j-1][i][1][0]!=1 else del_ext
      del_x = del_open if matrix[j][i-1][1][0]!=2 else del_ext

      matrix[j][i][0]=max(matrix[j-1][i-1][0]+mis_match, matrix[j-1][i][0]+del_y, matrix[j][i-1][0]+del_x, 0)


      up_left = matrix[j-1][i-1][0]+mis_match
      up = matrix[j-1][i][0]+del_y
      left = matrix[j][i-1][0]+del_x

      #dir = ['up_left', 'up', 'left']
      parent = np.argmax([up_left, up, left])
      matrix[j][i][1]=[parent]

  return matrix

def SW_affine_alignements(seq1, seq2, match_bonus, mismatch, del_open, del_ext):
  matrix = Smith_Waterman_affine(seq1, seq2, match_bonus, mismatch, del_open, del_ext)
  MAXs, score = find_max(matrix)
  alignements = []
  for i in MAXs:
    a = align_from_matrix(matrix, seq1, seq2, i)
    alignements+=a
  final = unique_lists(alignements)
  return score, final


score, align = SW_linear_alignements("GATTTGGTTTCGTAACTTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)

Score: 31

____ALIGN 1 ____
GATTTGGTTTCGTAACTTTCGAAAATAGTGC
|||||||||||||||||||||||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGTGC


score, align = SW_linear_alignements("GATCTGGTTTCGTAGCTTTCGAAAATAGTTC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)

Score: 25

____ALIGN 1 ____
GATCTGGTTTCGTAGCTTTCGAAAATAGTTC
|||:||||||||||:||||||||||||||:|
GATTTGGTTTCGTAACTTTCGAAAATAGTGC

____ALIGN 2 ____
GATCTGGTTTCGTAGCTTTCGAAAATAGT
|||:||||||||||:||||||||||||||
GATTTGGTTTCGTAACTTTCGAAAATAGT


#linear gap penalty
score, align = SW_linear_alignements("GATTTGGTTTCGTATTTCGAAAATAGTGC", "GATTTGGTTTCGTATACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)

Score: 26

____ALIGN 1 ____
GATTTGGTTTCG--TA-TTTCGAAAATAGTGC
||||||||||||  || |||||||||||||||
GATTTGGTTTCGTATACTTTCGAAAATAGTGC

____ALIGN 2 ____
GATTTGGTTTCGTAT--TT-CGAAAATAGTGC
|||||||||||||||  || ||||||||||||
GATTTGGTTTCGTATACTTTCGAAAATAGTGC

____ALIGN 3 ____
GATTTGGTTTCGTAT--T-TCGAAAATAGTGC
|||||||||||||||  | |||||||||||||
GATTTGGTTTCGTATACTTTCGAAAATAGTGC

____ALIGN 4 ____
GATTTGGTTTCGTA---TTTCGAAAATAGTGC
||||||||||||||   |||||||||||||||
GATTTGGTTTCGTATACTTTCGAAAATAGTGC

____ALIGN 5 ____
GATTTGGTTTCGT--A-TTTCGAAAATAGTGC
|||||||||||||  | |||||||||||||||
GATTTGGTTTCGTATACTTTCGAAAATAGTGC

____ALIGN 6 ____
GATTTGGTTTCGTAT---TTCGAAAATAGTGC
|||||||||||||||   ||||||||||||||
GATTTGGTTTCGTATACTTTCGAAAATAGTGC


#affine gap penalty
score, align = SW_affine_alignements("GATTTGGTTTCGTATTTCGAAAATAGTGC", "GATTTGGTTTCGTATACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, del_open = -5, del_ext = -1)
print("Score:", score)
read_align(align)

Score: 22

____ALIGN 1 ____
GATTTGGTTTCGTAT---TTCGAAAATAGTGC
|||||||||||||||   ||||||||||||||
GATTTGGTTTCGTATACTTTCGAAAATAGTGC


import random

#generate random 20 sequences (length 200)
seqs = []

nucl = ["A", "T", "G", "C"]
for k in range(20):
  seq = ''
  for symbol in range(200):
     seq += nucl[random.randint(0, len(nucl) - 1)]
  seqs.append(seq)

scores_linear = []
scores_affine = []

#Compute the scores with both algorithms
for i in range(len(seqs)):
  for j in range(i+1, len(seqs)):
    matrix_linear = Smith_Waterman_linear(seqs[i], seqs[j], match_bonus = 1, mismatch = -1, deletion=-1)
    m, score_linear = find_max(matrix_linear)
    scores_linear.append(score_linear)

    matrix_affine = Smith_Waterman_affine(seqs[i], seqs[j], match_bonus = 1, mismatch = -1, del_open = -5, del_ext=-1)
    m, score_affine = find_max(matrix_affine)
    scores_affine.append(score_affine)


import matplotlib.pyplot as plt
import seaborn as sns

sns.histplot(scores_linear, color='orange', fill=True, label='linear penalty', binwidth=2)
sns.histplot(scores_affine, color='red', fill=True, label='affine penalty', binwidth=2)
plt.legend()
plt.xlabel("Score")

Text(0.5, 0, 'Score')

Smith Waterman algorithm with linear gap penalty¶

Tests¶

Smith Waterman algorithm with affine gap penalty¶

Tests¶

Distribution of scores¶