Скрипт, читающий файл генома |
|
|
Скачать скрипт
Чтобы запустить скрипт в коммандную строку нужно ввести имя самого скрипта, имя читаемого файла и название папки, куда будет сохраняться информация. Например: Genome_reader.py NC_004193.gbk O.iheyensis #Head# MSU. Bioinformatics # Term1. Block2. Praktice 8. Task 2. # Genome_reader 2.0 (added possibility to make new fold, ehere files will be saved and possibility to get information about RNAs. Operating time of script is significantly reduced) # This script reads file *.gbk, create new folder and create in this folder files, where information about genes, which encode proteins, different RNAs, is saved (for each type - new file). It also create file, where information about all "strange" genes is saved. (genes, whith "join" or genes, which are not exactly defined (genes with "<" or ">").) # Autor: Demkiv A.O. # Last modification date: 30.11.2013 #Information, which must be recieved from user import sys
if len(sys.argv) != 3:
print "Script should recieve 2 arguments"
print "First - name of file to open"
print "Second - name of fold, where information should be saved"
exit()
need_RNA = raw_input("Do you want to make file with information about RNA?(please, write 'yes' or 'no')").lower()
#Open file fo reading file_name = sys.argv[1] gen_file = open(file_name) #Create new folder import os
try:
os.makedirs(sys.argv[2])
except OSError:
pass
#Create new files in the folder new_file = open("./%s/%s.txt"%(sys.argv[2], sys.argv[2]), "w")
new_file.write("locus_tag\tfirst_nuc\tlast_nuc\torientation\tgene_name\trefseq_protein_id\tproduct")
strange_file = open("./%s/Strange_genes_%s.txt" %(sys.argv[2], sys.argv[2]), "w")
strange_file.write("locus_tag\tfirst_nuc\tlast_nuc\torientation\tgene_name\trefseq_protein_id\tproduct")
if need_RNA == "yes":
tRNA_file = open("./%s/tRNA_%s.txt"%(sys.argv[2], sys.argv[2]), "w")
tRNA_file.write("locus_tag\tbegin\tend\torientation\tname\tproduct")
rRNA_file = open("./%s/rRNA_%s.txt"%(sys.argv[2], sys.argv[2]), "w")
rRNA_file.write("locus_tag\tbegin\tend\torientation\tname\tproduct")
ncRNA_file = open("./%s/ncRNA_%s.txt"%(sys.argv[2], sys.argv[2]), "w")
ncRNA_file.write("locus_tag\tbegin\tend\torientation\tname\tproduct")
#Create class
class Gene:
begin = 1
end = 100
orientation = 1
locus_tag = ""
name = ""
protein_id = ""
product = ""
def protein_write(self):
new_file.write("\n%s\t%s\t%s\t%s\t%s\t%s\t%s" %(self.locus_tag, self.begin, self.end, self.orientation, self.name, self.protein_id, self.product))
def tRNA_write(self):
tRNA_file.write("\n%s\t%s\t%s\t%s\t%s\t%s"%(self.locus_tag, self.begin, self.end, self.orientation, self.name, self.product))
def rRNA_write(self):
rRNA_file.write("\n%s\t%s\t%s\t%s\t%s\t%s"%(self.locus_tag, self.begin, self.end, self.orientation, self.name, self.product))
def ncRNA_write(self):
ncRNA_file.write("\n%s\t%s\t%s\t%s\t%s\t%s"%(self.locus_tag, self.begin, self.end, self.orientation, self.name, self.product))
def strange_genes(self):
strange_file.write("\n%s\t%s\t%s\t%s\t%s\t%s\t%s" %(self.locus_tag, self.begin, self.end, self.orientation, self.name, self.protein_id, self.product))
def get_dop(self, k, protein_list):
line = protein_list[k].split("\n")
for l in range(len(line)):
lline = line[l].strip().split("=")
if (lline[0] == "/gene"):
self.name = lline[1].translate(None, '"')
elif (lline[0] == "/locus_tag"):
self.locus_tag = lline[1].translate(None, '"')
elif (lline[0] == "/protein_id"):
self.protein_id = lline[1].translate(None, '"')
elif (lline[0] == "/product"):
if (line[l].strip())[-1] != '"':
x = 0
quote_found = False
while not(quote_found):
if x == 0:
self.product = lline[1].translate(None, '"')
x = x + 1
else:
self.product = self.product + line[l+x].strip().translate(None, '"')
if (line[l+x].strip())[-1] == '"':
quote_found = True
x = x + 1
else:
self.product = lline[1].translate(None, '"')
def end_begin_simple(self, buf1):
self.begin = buf1[0].strip("(")
self.end = buf1[-1].strip(")")
def end_begin(self, buf1, protein_list, k):
self.begin = buf1[0].strip("(")
o = 0
string = " "
while string[-1] != ")":
string = protein_list[k].split("\n")[o].strip()
buf1 = string.translate(None, "<>(),").split("..")
o = o + 1
self.end = buf1[-1]
def __init__(self):
0
#Create lists of CDS and RNA blocks
protein_list = []
protein_list.append("")
u = 0
CDS_block = True
if need_RNA == "yes":
tRNA_block = True
tRNA_list = []
tRNA_list.append("")
t = 0
rRNA_block = True
rRNA_list = []
rRNA_list.append("")
r = 0
ncRNA_block = True
ncRNA_list = []
ncRNA_list.append("")
nc = 0
for line in gen_file:
#CDS block
if " CDS " in line:
protein_list.append("")
u = u + 1
CDS_block = True
if (CDS_block == True) and (len(line) > 8) and (line[5] != " ") and (line[5] != "C") and (line[6] != " ") and (line[6] != "D") and (line[7] != " ") and (line[7] != "S"):
CDS_block = False
if (CDS_block == True):
protein_list[u] = protein_list[u] + line
if need_RNA == "yes":
#tRNA block
if " tRNA " in line:
tRNA_block = True
tRNA_list.append("")
t = t+1
if (tRNA_block == True) and (len(line) > 8) and (line[5] != " ") and (line[5] != "t") and (line[6] != " ") and (line[6] != "R") and (line[7] != " ") and (line[7] != "N") and (line[8] != " ") and (line[8] != "A"):
tRNA_block = False
if tRNA_block == True:
tRNA_list[t] = tRNA_list[t] + line
#rRNA block
if " rRNA " in line:
rRNA_block = True
rRNA_list.append("")
r = r+1
if (rRNA_block == True) and (len(line) > 8) and (line[5] != " ") and (line[5] != "r") and (line[6] != " ") and (line[6] != "R") and (line[7] != " ") and (line[7] != "N") and (line[8] != " ") and (line[8] != "A"):
rRNA_block = False
if rRNA_block == True:
rRNA_list[r] = rRNA_list[r] + line
#ncRNA block
if " ncRNA " in line:
ncRNA_block = True
ncRNA_list.append("")
nc = nc + 1
if (ncRNA_block == True) and (len(line) > 9) and (line[5] != " ") and (line[5] != "n") and (line[6] != " ") and (line[6] != "c") and (line[7] != " ") and (line[7] != "R") and (line[8] != " ") and (line[8] != "N") and (line[9] != " ") and (line[9] != "A"):
ncRNA_block = False
if ncRNA_block == True:
ncRNA_list[nc] = ncRNA_list[nc] + line
#Function to extract information
def information(x, y, element):
i = 0
for k in range(len(x)):
buf = x[k].split("\n")[0].strip(element)
coordinate = buf.strip()
coordinate_buf = coordinate.translate(None, "().<>")
if coordinate_buf.isdigit():
buf1 = coordinate.translate(None, "<>").split("..")
y.append(Gene())
y[i].end_begin_simple(buf1)
for g in coordinate:
if (g == ">") or (g == "<"):
y[i].orientation = 1111
break
else:
y[i].orientation = 1
y[i].get_dop(k, x)
if y[i].orientation == 1111:
y[i].strange_genes()
i = i+1
elif coordinate_buf.strip("join").translate(None, ',').isdigit():
buf3 = coordinate.strip("join").translate(None, '<>').split(",")
if buf3[-1] == '':
buf3.remove('')
buf1 = [buf3[0].split("..")[0]] + [buf3[-1].split("..")[1]]
y.append(Gene())
y[i].end_begin(buf1, x, k)
for g in coordinate:
if (g == ">") or (g == "<"):
y[i].orientation = 111
break
else:
y[i].orientation = 11
y[i].get_dop(k, x)
y[i].strange_genes()
i = i+1
else:
buf2 = coordinate.strip("complement").translate(None, "<>()")
if not(buf2.translate(None, ".").isdigit()):
if buf2.strip("join").translate(None, "().,").isdigit():
buf3 = buf2.strip("join").split(",")
if buf3[-1] == '':
buf3.remove('')
buf1 = [buf3[0].split("..")[0]] + [buf3[-1].split("..")[1]]
y.append(Gene())
y[i].end_begin(buf1, x, k)
for g in coordinate:
if (g == ">") or (g == "<"):
y[i].orientation = -111
break
else:
y[i].orientation = -11
y[i].get_dop(k, x)
y[i].strange_genes()
i = i+1
else:
continue
elif buf2.translate(None, ".").isdigit():
buf1 = buf2.split("..")
y.append(Gene())
y[i].end_begin(buf1, x, k)
for g in coordinate:
if (g == ">") or (g == "<"):
y[i].orientation = -1111
break
else:
y[i].orientation = -1
y[i].get_dop(k, x)
if y[i].orientation == -1111:
y[i].strange_genes()
i = i+1
#Using function
protein_llist = []
information(protein_list, protein_llist, " CDS ")
for elem in protein_llist:
elem.protein_write()
if need_RNA=="yes":
tRNA_llist = []
information(tRNA_list, tRNA_llist, " tRNA ")
for elem in tRNA_llist:
elem.tRNA_write()
rRNA_llist = []
information(rRNA_list, rRNA_llist, " rRNA ")
for elem in rRNA_llist:
elem.rRNA_write()
ncRNA_llist = []
information(ncRNA_list, ncRNA_llist, " ncRNA ")
for elem in ncRNA_llist:
elem.ncRNA_write()
#Closing files
new_file.close()
strange_file.close()
if need_RNA == "yes":
tRNA_file.close()
rRNA_file.close()
ncRNA_file.close()
Протестировать работу скрипта можно, например на этом файле: NC_004193.gbk - файл, содержащий геном бактерии Oceanobacillus iheyensis штамм HTE831. |
|
| © Демкив Андрей 2013 | Дата последнего изменения: 29.05.2015 |