import random
import requests
import sys
import math
from statsmodels.stats.weightstats import ttest_ind as ttest
import matplotlib.pyplot as plt
import numpy as np
#generate random set of Kozak sequences
with open('human-genes.tsv', 'r') as file:
lines = file.readlines()[1:] # remove the table header
with open('learn.fasta', 'w') as f, open('test.fasta', 'w') as g:
random.shuffle(lines)
count = 0
errorcnt = 0
for index, line in enumerate(lines):
b = line.split("\t")
if b[4] == "+":
ext = "https://rest.ensembl.org" +"/sequence/region/human/" + b[0][3:] + ":" + str(int(b[6]) - 6) + ".." + str(int(b[6]) + 6) + ":1?expand_3prime=0;expand_5prime=0"
elif b[4] == "-":
ext = "https://rest.ensembl.org" + "/sequence/region/human/" + b[0][3:] + ":" + str(int(b[7]) - 5) + ".." + str(int(b[7]) + 7) + ":-1?expand_3prime=0;expand_5prime=0"
try:
r = requests.get(ext, headers={"Content-Type": "text/x-fasta"}) # download seq
except (requests.ConnectionError, requests.Timeout): # in case server doesn't answer
errorcnt += 1
if errorcnt < 50: # in case server doesn't answer very often
continue
else:
break
if not r.ok: # for HTTPerror, not sure
r.raise_for_status()
sys.exit()
if r.text[-7:-4] == "ATG" :
count += 1
if count <= 500: #first 500 seq for learning
f.write(r.text)
elif count <= 1000: #next 500 seq for test
g.write(r.text)
if count == 1000:
break
lines = []
#generate pseudo Kozak sequences
chr_num = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y']
chr_len = [248956422, 242193529, 198295559, 190214555, 181538259, 170805979, 159345973, 145138636, 138394717, 133797422, 135086622, 133275309, 114364328, 107043718, 101991189, 90338345, 83257441, 80373285, 58617616, 64444167, 46709983, 50818468, 156040895, 57227415]
chr_adit = [248956422, 491149951, 689445510, 879660065, 1061198324, 1232004303, 1391350276, 1536488912, 1674883629, 1808681051, 1943767673, 2077042982, 2191407310, 2298451028, 2400442217, 2490780562, 2574038003, 2654411288, 2713028904, 2777473071, 2824183054, 2875001522, 3031042417, 3088269832]
count = 0
with open('negative.fasta', 'w') as f:
while count < 500:
a = random.randint(1, chr_adit[-1]) # random point in "total" genome length
c = 0
for j in range(len(chr_adit)): # find chr on which a
if chr_adit[j] >= a:
c = j
break
if chr_len[c] < 1000:
continue
ext = "https://rest.ensembl.org/sequence/region/human/" + chr_num[c] + ":" + str(chr_adit[j] - a) + ".." + str(chr_adit[j] - a + 998) + ":1?expand_3prime=0;expand_5prime=0"
r = requests.get(ext, headers={"Content-Type": "text/x-fasta"})
if not r.ok: # for HTTPerror
r.raise_for_status()
sys.exit()
r = r.text[-999:]
s = ''.join(ch for ch in r if ch.isalnum())
l = s[7:].find("ATG")
if l == -1:
continue
s = s[l: l + 13]
f.write(">" + str(count) +"\n")
f.write(s+"\n")
count += 1
# generate IC matrix
with open('learn.fasta', 'r') as f:
d = {"A" : 0, "T" : 1, "G" : 2, "C" : 3}
let = ["A", "T", "G", "C"]
fr = {0 : 0.295, 1 : 0.295, 2 : 0.205, 3 : 0.205}
ep = 0.2
lines = f.readlines()
Training_group = [line.strip() for line in lines if not line.startswith(">")]
pnm = [[0]*4 for i in range(13)] #why not np.zeros(shape=(4, 13))
pwm = [[0]*4 for i in range(13)]
ic = [[0]*5 for i in range(13)]
for index, line in enumerate(lines):
if index % 2 == 0: # skip seq name
continue
for i in range(13):
pnm[i][d[line[i]]] += 1 # count letters
for i in range(13):
for j in range(4):
pwm[i][j] = math.log((pnm[i][j]+ep)/(500+ep)/fr[j]) # count score
f.close()
for i in range(13):
for j in range(4):
if pnm[i][j] == 0:
ic[i][j] = 0
else:
ic[i][j] = math.log((pnm[i][j])/(500)/fr[j])/math.log(2)*((pnm[i][j])/(500))
ic[i][4] = sum(ic[i][0:4])
# positive control
with open('test.fasta', 'r') as f:
lines = f.readlines()
Test_group = [line.strip() for line in lines if not line.startswith(">")]
# negative control
with open("negative.fasta", "r") as f:
lines = f.readlines()
Negative_control = [line.strip() for line in lines if not line.startswith(">")]
# generate output files
icsum = 0
for i in ic:
icsum += sum(i)
with open("pwm.csv", "w") as r:
r.write("letter,1,2,3,4,5,6,7,8,9,10,11,12,13\n")
for i in range(4):
r.write(let[i])
for j in range(13):
r.write("," + f"{pwm[j][i]:.02f}")
r.write("\n")
with open("ic.csv", "w") as r:
r.write("letter,1,2,3,4,5,6,7,8,9,10,11,12,13\n")
for i in range(4):
r.write(let[i])
for j in range(13):
r.write("," + f"{ic[j][i]:.02f}")
r.write("\n")
r.write("IC(j)")
for j in range(13):
r.write("," + f"{ic[j][4]:.02f}")
r.write("\n")
def calc_weights(seqs, matrix):
w_list = []
for seq in seqs:
w = 0
for i in range(len(seq)):
w += matrix[i][d[seq[i]]]
w_list.append(w)
return w_list
Training_group_w = calc_weights(Training_group, pwm)
Test_group_w = calc_weights(Test_group, pwm)
Negative_control_w = calc_weights(Negative_control, pwm)