# MSU FBB. Bioinformatics. Term 1. Block 2. Practice 8. Task +.
# Chromosomal table v1.0.
# Author: Pogorelskaya A.M.
# Last modification date: 16.11.2013 11:00.
import sys
# Command-line handling and file setup.
# The script expects exactly one argument: the path of the GenBank file.
if len(sys.argv) != 2:
    # sys.exit(<str>) writes the message to stderr and exits with status 1,
    # so a usage error is not reported to the shell as success.  It also
    # avoids the Python-2-only print statement.
    sys.exit("Script should receive 1 argument:\nthe name of GenBank file;")
file_r = sys.argv[1]
# Open the input first: if the path is wrong the script stops here, before
# the output table has been created or truncated.
fileread = open(file_r)
filewrite = open("Pogorelskaya_NC_007759_genes.txt", "w")
# Header row of the tab-separated output table.
filewrite.write("#Locus_tag\tGene_name\tRefseq_protein_id\tProduct\tBegin\tEnd\tOrientation\tNote\n")
class ProteinCodingGene:
    """One row of the gene table: a coding sequence and its annotations.

    Every field is stored exactly as it was handed to the constructor;
    get_data() concatenates the fields, so they are expected to be strings.
    """

    # Class-level defaults; each instance overrides all of them in __init__.
    begin = end = orientation = ""
    locus_tag = gene = protein_id = ""
    product = note = ""

    def __init__(self, b, e, o, l, g, i, p, n):
        """Store the record fields.

        b, e, o -- begin coordinate, end coordinate, orientation code
        l, g, i -- locus tag, gene name, protein id
        p, n    -- product description, note
        """
        self.begin, self.end, self.orientation = b, e, o
        self.locus_tag, self.gene, self.protein_id = l, g, i
        self.product, self.note = p, n

    def get_data(self):
        """Return the record as one tab-separated line ending in a newline.

        Column order: locus tag, gene, protein id, product, begin, end,
        orientation, note.
        """
        columns = (
            self.locus_tag,
            self.gene,
            self.protein_id,
            self.product,
            self.begin,
            self.end,
            self.orientation,
            self.note,
        )
        return "\t".join(columns) + "\n"
def _drop_chars(text, unwanted):
    """Return text with every character listed in unwanted removed.

    Portable replacement for the Python-2-only str.translate(None, chars).
    """
    for char in unwanted:
        text = text.replace(char, "")
    return text


def str_with_CDS(line, locus_tag, gene, protein_id, product, note):
    """Build a ProteinCodingGene from a CDS location string and its qualifiers.

    line is the feature location, expected without embedded whitespace, e.g.
    "123..456", "complement(123..456)", "join(1..5,9..20)" or "<1..>99".
    locus_tag, gene, protein_id, product and note are passed through
    unchanged to ProteinCodingGene.

    Orientation codes stored in the returned record:
        "1"     plain span             "-1"    complement of a plain span
        "11"    join(...)              "-11"   complement(join(...))
        "111"   span with < or >       "-111"  complement of such a span
        ""      complement(...) wrapping anything else (begin/end are "" too)

    Begin is the first coordinate of the location and end is the last one.
    Raises IndexError for locations without a ".." where one is required.
    """
    if line.split("(")[0] == "complement":
        # Use "(" as a separator: "complement(1..9)" -> "complement 1..9)".
        spaced = line.replace("(", " ")
        fields = spaced.split()
        if len(fields) != 2:
            # Something is nested inside complement(...).
            if fields[1] == "join":
                span = fields[2].split("..")
                begin = span[0]
                end = span[-1].strip(")")
                chain = "-11"
            else:
                # Unsupported nested operator: leave the coordinates blank.
                begin = ""
                end = ""
                chain = ""
        else:
            span = fields[1].split("..")
            if span[0].isdigit() and span[1].strip(")").isdigit():
                begin = span[0]
                end = span[1].strip(")")
                chain = "-1"
            else:
                # Partial feature: drop the < > markers and the brackets.
                span = _drop_chars(spaced, "><()").split()[1].split("..")
                begin = span[0]
                end = span[1]
                chain = "-111"
    elif line.split("(")[0] == "join":
        # NOTE(review): for "join(complement(...),...)" the text after the
        # first "(" is "complement", so begin and end both become that word.
        span = line.split("(")[1].split("..")
        begin = span[0]
        end = span[-1].strip(")")
        chain = "11"
    else:
        span = line.split("..")
        if span[0].isdigit() and span[1].isdigit():
            begin = span[0]
            end = span[1]
            chain = "1"
        else:
            # Partial feature on the direct strand: drop the < > markers.
            span = _drop_chars(line, "><").split("..")
            begin = span[0]
            end = span[-1]
            chain = "111"
    # Wrap the coordinates, orientation and qualifiers into one record.
    return ProteinCodingGene(begin, end, chain, locus_tag, gene, protein_id, product, note)
# ---------------------------------------------------------------------------
# Feature-table parser.
#
# Reads the GenBank file line by line (each line stripped of surrounding
# whitespace) and builds one record per CDS feature through str_with_CDS().
# A CDS stays "pending" while its qualifiers are collected and is stored
# when the next "gene", "misc_feature" or "CDS" key is met, or at end of
# file.  Finally every record is written to the output table.
# ---------------------------------------------------------------------------

def _blank_qualifiers():
    """Return a qualifier dict with every tracked field set to ""."""
    return dict.fromkeys(("locus_tag", "gene", "protein_id", "product", "note"), "")


def _finish_cds(location, qualifiers):
    """Turn a CDS location string and its qualifier dict into a gene record."""
    return str_with_CDS(
        location,
        qualifiers["locus_tag"],
        qualifiers["gene"],
        qualifiers["protein_id"],
        qualifiers["product"],
        qualifiers["note"],
    )


prodind = False  # inside a /product value that continues on the next line
noteind = False  # inside a /note value that continues on the next line
lineind = False  # inside a CDS location that continues on the next line
ind = False      # a CDS has been read and is not stored yet
descdict = _blank_qualifiers()
line = ""        # location string of the pending CDS
gene_list = []

try:
    for temp_str in fileread:
        temp_str = temp_str.strip()
        if not temp_str:
            continue
        if prodind:
            descdict["product"] = descdict["product"] + " " + temp_str.strip('"')
            if temp_str[-1] == '"':
                # The closing quote ends the product description.
                prodind = False
        elif noteind:
            descdict["note"] = descdict["note"] + " " + temp_str.strip('"')
            if temp_str[-1] == '"':
                noteind = False
        elif lineind:
            line = line + temp_str
            if line[-1] != ",":
                lineind = False
        else:
            tokens = temp_str.split()
            if len(tokens) != 1:
                key = tokens[0]
                # A new "gene", "misc_feature" or "CDS" key closes the pending
                # CDS.  Multi-line notes and products are consumed above, so
                # these words are not mistaken for keys when they occur there.
                # Closing on "CDS" too keeps two consecutive CDS features
                # from having their locations glued together.
                if ind and key in ("gene", "misc_feature", "CDS"):
                    gene_list.append(_finish_cds(line, descdict))
                    ind = False
                    line = ""
                    descdict = _blank_qualifiers()
                if key == "CDS":
                    ind = True
                    line = "".join(tokens[1:])
                    # A trailing comma means the location continues on the
                    # following line(s).
                    lineind = line[-1] == ","
        if ind and temp_str[0] == "/":
            # Split on the first "=" only, so values containing "=" stay
            # whole; qualifiers without "=" carry no value and are skipped.
            name, separator, value = temp_str.lstrip("/").partition("=")
            if separator and name in descdict:
                descdict[name] = value.strip('"')
                if temp_str[-1] != '"':
                    # No closing quote: the value continues on the next line.
                    # Only product and note are followed across lines.
                    if name == "product":
                        prodind = True
                    elif name == "note":
                        noteind = True
    if ind:
        # Store the last CDS: no further feature key follows it in the file.
        gene_list.append(_finish_cds(line, descdict))
        ind = False
    for gene in gene_list:
        filewrite.write(gene.get_data())
finally:
    # Close both files even if parsing fails part-way through.
    fileread.close()
    filewrite.close()
# (c) Pogorelskaya Sasha
# Last modification date 13.12.13