import smith_watterman as sw
from testing import *
import os, sys
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# class to supress fuction output
# is used to run algorithm multiple times to know scores
class OnlyTest:
def __enter__(self):
self._original_stdout = sys.stdout
sys.stdout = open(os.devnull, 'w')
def __exit__(self, exc_type, exc_val, exc_tb):
sys.stdout.close()
sys.stdout = self._original_stdout
I used this website for self-checking http://rna.informatik.uni-freiburg.de/Teaching/index.jsp?toolName=Smith-Waterman
Transition matrix and alignments appear to be the same here and on the website
seq1 = random_seq(n=17)
seq2 = random_seq(n=14)
score = sw.water(seq1, seq2)
from collections import defaultdict
from tqdm import tqdm
with OnlyTest():
scores = defaultdict(list)
for t in ['mut', 'del']:
for i in tqdm(range(5)):
for _ in range(1000):
seq1 = random_seq(n=20)
if t == 'mut':
seq2 = change_seq(seq1, n=i, mutation=True)
else:
seq2 = change_seq(seq1, n=i, mutation=False)
scores[t + str(i+1)].append(sw.water(seq1, seq2))
df = pd.DataFrame(scores).melt()
sns.catplot(kind='box', data=df, x='variable', y='value', palette="Set3", aspect=3)
plt.xlabel('')
plt.ylabel('Alignment score')
plt.show()
The score decreases with the increase in the number of mutations and deletions. It's noticeable that in the case of mismatches dispersion is higher. Also, deletions cause a more dramatic decrease in the score.
def make_plots(scores):
df = pd.DataFrame(scores).melt()
df.loc[:,'coeff'] = df.value / df.variable
plt.figure(figsize=(16, 6))
plt.subplot(1, 2, 1)
sns.lineplot(data=df, x='variable', y='value')
plt.xlabel('Sequence length')
plt.ylabel('Alignment score')
plt.subplot(1, 2, 2)
sns.lineplot(data=df, x='variable', y='coeff')
plt.xlabel('Sequence length')
plt.ylabel('Alignment score / Sequence length')
plt.tight_layout()
# default parameters
with OnlyTest():
scores_length = defaultdict(list)
for n in tqdm(range(10, 100, 5)):
for _ in range(100):
seq1 = random_seq(n=n)
seq2 = random_seq(n=n)
scores_length[n].append(sw.water(seq1, seq2))
make_plots(scores_length)
# zero mismatch penalty
with OnlyTest():
scores_length = defaultdict(list)
for n in tqdm(range(10, 100, 5)):
for _ in range(100):
seq1 = random_seq(n=n)
seq2 = random_seq(n=n)
scores_length[n].append(sw.water(seq1, seq2, mismatch = 0))
make_plots(scores_length)
# big mismatch and gap penalty
with OnlyTest():
scores_length = defaultdict(list)
for n in tqdm(range(10, 100, 5)):
for _ in range(100):
seq1 = random_seq(n=n)
seq2 = random_seq(n=n)
scores_length[n].append(sw.water(seq1, seq2, mismatch = -100000, gap_penalty = -10000))
make_plots(scores_length)
Let's perfom simulations
We will suppose independence in out simulation, which actually doesn't exist. The effect still looks similar
alignment score / sequence length tends to Chvátal–Sankoff constants (https://en.wikipedia.org/wiki/Chv%C3%A1tal%E2%80%93Sankoff_constants)
for ind, k in enumerate(k_range):
for i in range(n_tries):
a = np.random.choice([0,1], size = (k, ), p = [0.75, 0.25])
max_ser = a.sum()
vals[ind, i] = max_ser
plt.plot(vals.mean(axis=1))
In the case of a large gap and mismatch penalty, the problem can be reformulated into the problem of the expected length of the longest sequence of successes in a series of Bernoulli trials. Let our coin be biased with probability $ p $. The random variable $ L_ {n} $ is the largest success sequence for $ n $ trials. The expected number of sequences of length greater than or equal to one can be found using the formula $ n (1-p) p $. The length is greater than or equal to $ k $, respectively, $ n (1-p) p ^ {k} $. We can approximate $ L_ {n} $: $$L_{n} \approx -log_{p} n(1-p) = log_{1/p} n(1-p)$$ Therefore $$ \frac{L_{n}}{log_{1/p} n(1-p)} \rightarrow 1$$ $$\forall \varepsilon > 0 \lim_{x\to \inf} P(| \frac{L_{n}}{log_{1/p} n(1-p)} - 1 | > \varepsilon) = 0 $$
k_range = np.arange(1, 1000)
n_tries = 1000
vals = np.zeros((k_range.shape[0], n_tries))
for ind, k in enumerate(k_range):
for i in range(n_tries):
a = np.random.choice([0,1], size = (k, ), p = [0.75, 0.25])
max_ser = 1
ser = 0
for j in range(1, a.shape[0]):
if a[j] == 1:
ser += 1
else:
if ser > max_ser:
max_ser = ser
ser = 0
if ser > max_ser:
max_ser = ser
vals[ind, i] = max_ser
plt.plot(vals.mean(axis=1))