import numpy as np
#Function that return the indices of all items with the highest value
def argmax_all(arr):
max_value = max(arr)
indices = [i for i, x in enumerate(arr) if x == max_value]
return indices
#Function, that computes the scoring matrix (uses linear gap penalty)
def Smith_Waterman_linear(seq1, seq2, match_bonus, mismatch, deletion):
matrix = [[ [0, []] for _ in range(len(seq1)+1)] for _ in range(len(seq2)+1)]
#each cell of the matrix has the following form: [value, [possible parents]]
#there may be several "parent" cells, so keep track of all of them
for i in range(1, len(seq1)+1):
for j in range(1, len(seq2)+1):
mis_match = match_bonus if seq1[i-1] == seq2[j-1] else mismatch
matrix[j][i][0]=max(matrix[j-1][i-1][0]+mis_match, matrix[j-1][i][0]+deletion, matrix[j][i-1][0]+deletion, 0)
up_left = matrix[j-1][i-1][0]+mis_match
up = matrix[j-1][i][0]+deletion
left = matrix[j][i-1][0]+deletion
parent = argmax_all([up_left, up, left])
matrix[j][i][1]=parent
return matrix
#Function finds the maximum value of the matrix (aka Score) and the coordinates of the said maximum, that represent the "beginnings" if the local alignements
def find_max(matrix):
MAXs = []
i_max = 0
j_max = 0
score = 0
for i in range(1, len(matrix[0])):
for j in range(1, len(matrix)):
#there may be several cells with the same (maximum) score, so I keep all their coordinates in the list MAXs
if matrix[j][i][0]>score:
score = matrix[j][i][0]
MAXs = [[i,j]]
elif matrix[j][i][0]==score:
MAXs.append([i,j])
return MAXs, score
#using the matrix and the coordinates of the begnning, the function presents the alignements in human-readable format
def align_from_matrix(matrix, seq1, seq2, max_coord):
i_now = max_coord[0]
j_now = max_coord[1]
alignements = []
seq1_aligned = ''
seq2_aligned = ''
symbols_align = ''
while matrix[j_now][i_now][0] != 0:
if len(matrix[j_now][i_now][1])==1:
if matrix[j_now][i_now][1][0] == 0: #up left
seq1_aligned+=seq1[i_now-1]
seq2_aligned+=seq2[j_now-1]
if seq1[i_now-1]==seq2[j_now-1]:
symbols_align+='|'
else:
symbols_align+=':'
j_now-=1
i_now-=1
elif matrix[j_now][i_now][1][0] == 1: #up
j_now-=1
seq1_aligned+="-"
seq2_aligned+=seq2[j_now]
symbols_align+=' '
elif matrix[j_now][i_now][1][0] == 2: #left
i_now-=1
seq1_aligned+=seq1[i_now]
seq2_aligned+="-"
symbols_align+=' '
else:
matrix_special = matrix
for j in matrix[j_now][i_now][1]:
matrix_special[j_now][i_now][1]=[j]
align = align_from_matrix(matrix_special, seq1, seq2, max_coord)
alignements+=align
alignements.append([seq1_aligned[::-1], symbols_align[::-1], seq2_aligned[::-1]])
return alignements
def unique_lists(list_of_lists):
unique_set = set(tuple(sub_list) for sub_list in list_of_lists)
unique_lists = [list(sub_list) for sub_list in unique_set]
return unique_lists
#join all the said functions in one
def SW_linear_alignements(seq1, seq2, match_bonus, mismatch, deletion):
matrix = Smith_Waterman_linear(seq1, seq2, match_bonus, mismatch, deletion)
MAXs, score = find_max(matrix)
alignements = []
for i in MAXs:
a = align_from_matrix(matrix, seq1, seq2, i)
alignements+=a
final = unique_lists(alignements)
return score, final
def read_align(align):
for i in range(len(align)):
print("\n____ALIGN", i+1, "____")
for j in align[i]:
print(j)
All the 'self made' alignements were compared to the ones recieved with http://rna.informatik.uni-freiburg.de/Teaching/index.jsp?toolName=Smith-Waterman
Test 1:
The same sequenecs align completely
score, align = SW_linear_alignements("GATTTGGTTTCGTAACTTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 31 ____ALIGN 1 ____ GATTTGGTTTCGTAACTTTCGAAAATAGTGC ||||||||||||||||||||||||||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC
Test 2:
Sequences with a couple of mismatches
Because the match_bonus and mismatch_pemalty have the same absolute value, it makes no difference whether to keep two last nucleotides (and get +1 for match and -1 for mismatch) as in the "ALIGN 1" or to skip them ("ALIGN 2")
score, align = SW_linear_alignements("GATCTGGTTTCGTAGCTTTCGAAAATAGTTC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 25 ____ALIGN 1 ____ GATCTGGTTTCGTAGCTTTCGAAAATAGTTC |||:||||||||||:||||||||||||||:| GATTTGGTTTCGTAACTTTCGAAAATAGTGC ____ALIGN 2 ____ GATCTGGTTTCGTAGCTTTCGAAAATAGT |||:||||||||||:|||||||||||||| GATTTGGTTTCGTAACTTTCGAAAATAGT
The changes of the parameters may tip the scales for either the first variant or the second one:
score, align = SW_linear_alignements("GATCTGGTTTCGTAGCTTTCGAAAATAGTTC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 2, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 53 ____ALIGN 1 ____ GATCTGGTTTCGTAGCTTTCGAAAATAGTTC |||:||||||||||:||||||||||||||:| GATTTGGTTTCGTAACTTTCGAAAATAGTGC
score, align = SW_linear_alignements("GATCTGGTTTCGTAGCTTTCGAAAATAGTTC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -2, deletion = -5)
print("Score:", score)
read_align(align)
Score: 23 ____ALIGN 1 ____ GATCTGGTTTCGTAGCTTTCGAAAATAGT |||:||||||||||:|||||||||||||| GATTTGGTTTCGTAACTTTCGAAAATAGT
Test 3:
3.1 Deletion in the beginning/end of the sequence => the alignements are cropped, as expected
score, align = SW_linear_alignements("TCGTAACTTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 22 ____ALIGN 1 ____ TCGTAACTTTCGAAAATAGTGC |||||||||||||||||||||| TCGTAACTTTCGAAAATAGTGC
3.2 Deletion in the middle => depending on the length of the deletion and deletion penalty:
it may be "forgiven"
one of the halves may be outweighed by the deletion penalty and lost
short deletion, small penalty
score, align = SW_linear_alignements("GATTTGGTTTCGTATTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 27 ____ALIGN 1 ____ GATTTGGTTTCGT-A-TTTCGAAAATAGTGC ||||||||||||| | ||||||||||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC ____ALIGN 2 ____ GATTTGGTTTCGTA--TTTCGAAAATAGTGC |||||||||||||| ||||||||||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC
small deletion, huge penalty
score, align = SW_linear_alignements("GATTTGGTTTCGTATTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -10)
print("Score:", score)
read_align(align)
Score: 15 ____ALIGN 1 ____ TTTCGAAAATAGTGC ||||||||||||||| TTTCGAAAATAGTGC
huge deletion
score, align = SW_linear_alignements("GATTTGGTTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 15 ____ALIGN 1 ____ TTTCGAAAATAGTGC ||||||||||||||| TTTCGAAAATAGTGC
Test 4:
The beginning of one sequence is the end of another
The same part aligns just fine =)
score, align = SW_linear_alignements("GGATTTGGTTTCGTAACTTTCGAAAATAGTG", "GAAAATAGTGCCGATGCGGACAGGGCGAGGGAGA", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 10 ____ALIGN 1 ____ GAAAATAGTG |||||||||| GAAAATAGTG
Test 5:
Checking the work of "find all the best alignements" part.
All the following alignements have the same score, because the gap penalty is linear - we see how "C-C" migrate through the indel.
Everything seems fine and coresponds to he "golden standart" from the web site.
score, align = SW_linear_alignements("TACCCCGGGCCCGCTAC", "TACGGGCCCGCTAC", match_bonus = 2, mismatch = -2, deletion = -1)
print("Score:", score)
read_align(align)
Score: 25 ____ALIGN 1 ____ TACCCCGGGCCCGCTAC || |||||||||||| TA---CGGGCCCGCTAC ____ALIGN 2 ____ TACCCCGGGCCCGCTAC || | ||||||||||| TA-C--GGGCCCGCTAC ____ALIGN 3 ____ TACCCCGGGCCCGCTAC || | ||||||||||| TA--C-GGGCCCGCTAC ____ALIGN 4 ____ TACCCCGGGCCCGCTAC ||| ||||||||||| TAC---GGGCCCGCTAC
In ambiguous cases there may be a lot of alignements with the same score
score, align = SW_linear_alignements("GATTTGGTTTCGTGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 17 ____ALIGN 1 ____ GATTTGGTTTCGT-------GAAAATAGTGC ||||||||||||| ||||||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC ____ALIGN 2 ____ GATTTGGTTTCG------T-GAAAATAGTGC |||||||||||| | ||||||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC ____ALIGN 3 ____ GATTTGGTTTCG-----T--GAAAATAGTGC |||||||||||| | ||||||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC ____ALIGN 4 ____ GATTTGGTTTCG----T---GAAAATAGTGC |||||||||||| | ||||||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC
score, align = SW_linear_alignements("GATTTGGTTTCGTAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 15 ____ALIGN 1 ____ GATTTGGTTTCGTAA------AA--TAGTGC ||||||||||||||| || |||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC ____ALIGN 2 ____ GATTTGGTTTCGTAA------A-A-TAGTGC ||||||||||||||| | | |||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC ____ALIGN 3 ____ GATTTGGTTTCGTAA------A--ATAGTGC ||||||||||||||| | ||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC ____ALIGN 4 ____ GATTTGGTTTCGTAA-------A-ATAGTGC ||||||||||||||| | ||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC ____ALIGN 5 ____ GATTTGGTTTCG----T----AAAATAGTGC |||||||||||| | |||||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC ____ALIGN 6 ____ GATTTGGTTTCG-----T---AAAATAGTGC |||||||||||| | |||||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC ____ALIGN 7 ____ GATTTGGTTTCGT-A-------AAATAGTGC ||||||||||||| | ||||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC ____ALIGN 8 ____ GATTTGGTTTCGTA-------A-AATAGTGC |||||||||||||| | |||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC ____ALIGN 9 ____ GATTTGGTTTCGTAA ||||||||||||||| GATTTGGTTTCGTAA ____ALIGN 10 ____ GATTTGGTTTCGTA--------AAATAGTGC |||||||||||||| ||||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC ____ALIGN 11 ____ GATTTGGTTTCGTAA--------AATAGTGC ||||||||||||||| |||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC ____ALIGN 12 ____ GATTTGGTTTCGT--------AAAATAGTGC ||||||||||||| |||||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC ____ALIGN 13 ____ GATTTGGTTTCG------T--AAAATAGTGC |||||||||||| | |||||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC
#all the function stay the same except the one that computes the matrix
def Smith_Waterman_affine(seq1, seq2, match_bonus, mismatch, del_open, del_ext):
matrix = [[ [0, "parent"] for _ in range(len(seq1)+1)] for _ in range(len(seq2)+1)]
i_max = 0
j_max = 0
num_max = 0
for i in range(1, len(seq1)+1):
for j in range(1, len(seq2)+1):
mis_match = match_bonus if seq1[i-1] == seq2[j-1] else mismatch
del_y = del_open if matrix[j-1][i][1][0]!=1 else del_ext
del_x = del_open if matrix[j][i-1][1][0]!=2 else del_ext
matrix[j][i][0]=max(matrix[j-1][i-1][0]+mis_match, matrix[j-1][i][0]+del_y, matrix[j][i-1][0]+del_x, 0)
up_left = matrix[j-1][i-1][0]+mis_match
up = matrix[j-1][i][0]+del_y
left = matrix[j][i-1][0]+del_x
#dir = ['up_left', 'up', 'left']
parent = np.argmax([up_left, up, left])
matrix[j][i][1]=[parent]
return matrix
def SW_affine_alignements(seq1, seq2, match_bonus, mismatch, del_open, del_ext):
matrix = Smith_Waterman_affine(seq1, seq2, match_bonus, mismatch, del_open, del_ext)
MAXs, score = find_max(matrix)
alignements = []
for i in MAXs:
a = align_from_matrix(matrix, seq1, seq2, i)
alignements+=a
final = unique_lists(alignements)
return score, final
The alignements were compered with the ones recieved from http://rna.informatik.uni-freiburg.de/Teaching/index.jsp?toolName=Gotoh
Test 1:
Same sequences
score, align = SW_linear_alignements("GATTTGGTTTCGTAACTTTCGAAAATAGTGC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 31 ____ALIGN 1 ____ GATTTGGTTTCGTAACTTTCGAAAATAGTGC ||||||||||||||||||||||||||||||| GATTTGGTTTCGTAACTTTCGAAAATAGTGC
Test 2:
Mismatches
score, align = SW_linear_alignements("GATCTGGTTTCGTAGCTTTCGAAAATAGTTC", "GATTTGGTTTCGTAACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 25 ____ALIGN 1 ____ GATCTGGTTTCGTAGCTTTCGAAAATAGTTC |||:||||||||||:||||||||||||||:| GATTTGGTTTCGTAACTTTCGAAAATAGTGC ____ALIGN 2 ____ GATCTGGTTTCGTAGCTTTCGAAAATAGT |||:||||||||||:|||||||||||||| GATTTGGTTTCGTAACTTTCGAAAATAGT
Test 3:
Much more interesting, however, is how this algorithm deals with gaps - I will use the same examples for easier comparison.
When using linear penalty, it makes no difference wheather the gaps stick together or not, leading to a much higher number of "top" alignements. When the penalty is affine, the best alignement is the one where the gaps are side by side.
#linear gap penalty
score, align = SW_linear_alignements("GATTTGGTTTCGTATTTCGAAAATAGTGC", "GATTTGGTTTCGTATACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, deletion = -1)
print("Score:", score)
read_align(align)
Score: 26 ____ALIGN 1 ____ GATTTGGTTTCG--TA-TTTCGAAAATAGTGC |||||||||||| || ||||||||||||||| GATTTGGTTTCGTATACTTTCGAAAATAGTGC ____ALIGN 2 ____ GATTTGGTTTCGTAT--TT-CGAAAATAGTGC ||||||||||||||| || |||||||||||| GATTTGGTTTCGTATACTTTCGAAAATAGTGC ____ALIGN 3 ____ GATTTGGTTTCGTAT--T-TCGAAAATAGTGC ||||||||||||||| | ||||||||||||| GATTTGGTTTCGTATACTTTCGAAAATAGTGC ____ALIGN 4 ____ GATTTGGTTTCGTA---TTTCGAAAATAGTGC |||||||||||||| ||||||||||||||| GATTTGGTTTCGTATACTTTCGAAAATAGTGC ____ALIGN 5 ____ GATTTGGTTTCGT--A-TTTCGAAAATAGTGC ||||||||||||| | ||||||||||||||| GATTTGGTTTCGTATACTTTCGAAAATAGTGC ____ALIGN 6 ____ GATTTGGTTTCGTAT---TTCGAAAATAGTGC ||||||||||||||| |||||||||||||| GATTTGGTTTCGTATACTTTCGAAAATAGTGC
#affine gap penalty
score, align = SW_affine_alignements("GATTTGGTTTCGTATTTCGAAAATAGTGC", "GATTTGGTTTCGTATACTTTCGAAAATAGTGC", match_bonus = 1, mismatch = -1, del_open = -5, del_ext = -1)
print("Score:", score)
read_align(align)
Score: 22 ____ALIGN 1 ____ GATTTGGTTTCGTAT---TTCGAAAATAGTGC ||||||||||||||| |||||||||||||| GATTTGGTTTCGTATACTTTCGAAAATAGTGC
This seems to be the proof that the algorithm works adequately.
import random
#generate random 20 sequences (length 200)
seqs = []
nucl = ["A", "T", "G", "C"]
for k in range(20):
seq = ''
for symbol in range(200):
seq += nucl[random.randint(0, len(nucl) - 1)]
seqs.append(seq)
scores_linear = []
scores_affine = []
#Compute the scores with both algorithms
for i in range(len(seqs)):
for j in range(i+1, len(seqs)):
matrix_linear = Smith_Waterman_linear(seqs[i], seqs[j], match_bonus = 1, mismatch = -1, deletion=-1)
m, score_linear = find_max(matrix_linear)
scores_linear.append(score_linear)
matrix_affine = Smith_Waterman_affine(seqs[i], seqs[j], match_bonus = 1, mismatch = -1, del_open = -5, del_ext=-1)
m, score_affine = find_max(matrix_affine)
scores_affine.append(score_affine)
import matplotlib.pyplot as plt
import seaborn as sns
sns.histplot(scores_linear, color='orange', fill=True, label='linear penalty', binwidth=2)
sns.histplot(scores_affine, color='red', fill=True, label='affine penalty', binwidth=2)
plt.legend()
plt.xlabel("Score")
Text(0.5, 0, 'Score')
Judging by the distribution, it can be concluded that affine penalty is stricter. The scores for random sequences are lower, so affine penalty should better filter out random alignments (alignements of non-homologous sequences) .