In [2]:
import random
import requests
import sys
import math
from statsmodels.stats.weightstats import ttest_ind as ttest
import matplotlib.pyplot as plt
import numpy as np

#generate random set of Kozak sequences

with open('human-genes.tsv', 'r') as file:
    lines = file.readlines()[1:] # remove the table header
with open('learn.fasta', 'w') as f, open('test.fasta', 'w') as g:
    
    random.shuffle(lines)
    count = 0
    errorcnt = 0
    for index, line in enumerate(lines):
        b = line.split("\t")
        if b[4] == "+":
            ext = "https://rest.ensembl.org" +"/sequence/region/human/" + b[0][3:] + ":" + str(int(b[6]) - 6) + ".." + str(int(b[6]) + 6) + ":1?expand_3prime=0;expand_5prime=0"
        elif b[4] == "-":
            ext = "https://rest.ensembl.org" + "/sequence/region/human/" + b[0][3:] + ":" + str(int(b[7]) - 5) + ".." + str(int(b[7]) + 7) + ":-1?expand_3prime=0;expand_5prime=0"
        try:
            r = requests.get(ext, headers={"Content-Type": "text/x-fasta"}) # download seq
        except (requests.ConnectionError, requests.Timeout): # in case server doesn't answer
            errorcnt += 1
            if errorcnt < 50: # in case server doesn't answer very often
                continue
            else:
                break
        if not r.ok: # for HTTPerror, not sure
            r.raise_for_status()
            sys.exit()
        if r.text[-7:-4] == "ATG" :
            count += 1
            if count <= 500: #first 500 seq for learning 
                f.write(r.text)
            elif count <= 1000: #next 500 seq for test
                g.write(r.text)
        if count == 1000:
            break
lines = []

#generate pseudo Kozak sequences

chr_num = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y']
chr_len = [248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, 138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, 83257441, 80373285, 58617616, 64444167, 46709983, 50818468, 156040895, 57227415]
chr_adit = [248956422, 491149951, 689445510, 879660065, 1061198324, 1232004303, 1391350276, 1536488912, 1674883629, 1808681051, 1943767673, 2077042982, 2191407310, 2298451028, 2400442217, 2490780562, 2574038003, 2654411288, 2713028904, 2777473071, 2824183054, 2875001522, 3031042417, 3088269832]
count = 0
with open('negative.fasta', 'w') as f:
    while count < 500:
        a = random.randint(1, chr_adit[-1]) # random point in "total" genome length
        c = 0
        for j in range(len(chr_adit)): # find chr on which a
            if chr_adit[j] >= a:
                c = j
                break
        if chr_len[c] < 1000:
            continue
        ext = "https://rest.ensembl.org/sequence/region/human/" + chr_num[c] + ":" + str(chr_adit[j] - a) + ".." + str(chr_adit[j] - a + 998) + ":1?expand_3prime=0;expand_5prime=0"
        r = requests.get(ext, headers={"Content-Type": "text/x-fasta"})
        if not r.ok: # for HTTPerror
            r.raise_for_status()
            sys.exit()
        r = r.text[-999:]
        s = ''.join(ch for ch in r if ch.isalnum())
        l = s[7:].find("ATG")
        if l == -1:
            continue
        s = s[l: l + 13]
        f.write(">" + str(count) +"\n")
        f.write(s+"\n")
        count += 1

# generate IC matrix

with open('learn.fasta', 'r') as f:
    d = {"A" : 0, "T" : 1, "G" : 2, "C" : 3}
    let = ["A", "T", "G", "C"]
    fr = {0 : 0.295, 1 : 0.295, 2 : 0.205, 3 : 0.205}
    ep = 0.2
    lines = f.readlines()
    Training_group = [line.strip() for line in lines if not line.startswith(">")]
    pnm = [[0]*4 for i in range(13)] #why not np.zeros(shape=(4, 13))
    pwm = [[0]*4 for i in range(13)]
    ic = [[0]*5 for i in range(13)]
    for index, line in enumerate(lines):
        if index % 2 == 0: # skip seq name
            continue
        for i in range(13):
            pnm[i][d[line[i]]] += 1 # count letters
    for i in range(13):
        for j in range(4):
            pwm[i][j] = math.log((pnm[i][j]+ep)/(500+ep)/fr[j]) # count score
    f.close()
    for i in range(13):
        for j in range(4):
            if pnm[i][j] == 0:
                ic[i][j] = 0
            else:
                ic[i][j] = math.log((pnm[i][j])/(500)/fr[j])/math.log(2)*((pnm[i][j])/(500))
        ic[i][4] = sum(ic[i][0:4])

# positive control

with open('test.fasta', 'r') as f:
    lines = f.readlines()
    Test_group = [line.strip() for line in lines if not line.startswith(">")]

# negative control

with open("negative.fasta", "r") as f:
    lines = f.readlines()
    Negative_control = [line.strip() for line in lines if not line.startswith(">")]

# generate output files

icsum = 0
for i in ic:
    icsum += sum(i)
    
with open("pwm.csv", "w") as r:
    r.write("letter,1,2,3,4,5,6,7,8,9,10,11,12,13\n")
    for i in range(4):
        r.write(let[i])
        for j in range(13):
            r.write("," + f"{pwm[j][i]:.02f}")
        r.write("\n")

    
with open("ic.csv", "w") as r:
    r.write("letter,1,2,3,4,5,6,7,8,9,10,11,12,13\n")
    for i in range(4):
        r.write(let[i])
        for j in range(13):
            r.write("," + f"{ic[j][i]:.02f}")
        r.write("\n")
    r.write("IC(j)")
    for j in range(13):
        r.write("," + f"{ic[j][4]:.02f}")
    r.write("\n")
        
def calc_weights(seqs, matrix):
    w_list = []
    for seq in seqs:
        w = 0
        for i in range(len(seq)):
            w += matrix[i][d[seq[i]]]
        w_list.append(w)
    return w_list

Training_group_w = calc_weights(Training_group, pwm)
Test_group_w = calc_weights(Test_group, pwm)
Negative_control_w = calc_weights(Negative_control, pwm)
In [3]:
top = int(max(max(Training_group_w),
              max(Test_group_w),
              max(Negative_control_w)))
bottom = int(min(min(Training_group_w),
                 min(Test_group_w),
                 min(Negative_control_w)))
bin_list = list(np.arange(bottom-1, top+1, 0.5))

plt.hist(Training_group_w, bin_list,
         histtype="step",
         edgecolor="#7B68EE",
         linewidth=1.5)

plt.hist(Test_group_w, bin_list,

         histtype="step",
         edgecolor="#FFA500",
         linewidth=1.5)

plt.hist(Negative_control_w, bin_list,
         histtype="step",
         edgecolor="#DC143C",
         linewidth=1.5)

plt.grid(which="major", axis="y", color="grey", alpha=0.3)
plt.title("Гистограмма распределений")
plt.xlabel("Вес последовательности")
plt.ylabel("Число последовательностей")
plt.legend(['Обучение',
            'Тестовая\nвыборка (+)',
            'Отрицательный\nконтроль (-)'
            ], loc='center left', bbox_to_anchor=(1, 0.5))
plt.savefig("hist.svg", bbox_inches="tight")
plt.savefig("hist.png", bbox_inches="tight")
In [4]:
thr = 4
learn_plus = len([i for i in Training_group_w if i >= thr])
test_plus = len([i for i in Test_group_w if i >= thr])
pseudo_plus = len([i for i in Negative_control_w if i >= thr])
with open("check.csv", "w") as r:
    print(",Обучение,Положительный контроль,Отрицательный контроль",
          file=r)
    print(f"Cигнал(+),"
          f"{learn_plus} "
          f"({(100*learn_plus/len(Training_group_w)):.01f}%),"
          f"{test_plus} "
          f"({(100*test_plus/len(Test_group_w)):.01f}%),"
          f"{pseudo_plus} "
          f"({(100*pseudo_plus/len(Negative_control_w)):.01f}%)",
          file=r)
    print(f"Cигнал(-),"
          f"{len(Training_group_w)-learn_plus} "
          f"({100-(100*learn_plus/len(Training_group_w)):.01f}%),"
          f"{len(Training_group_w)-test_plus} "
          f"({100-(100*test_plus/len(Test_group_w)):.01f}%),"
          f"{len(Training_group_w)-pseudo_plus} "
          f"({100-(100*pseudo_plus/len(Negative_control_w)):.01f}%)",
          file=r)
In [ ]: