# MSU FBB. Bioinformatics. Term 1. Block 2. Practice 8. Task +.
# Chromosomal table v1.0.
# Author: Pogorelskaya A.M.
# Last modification date: 16.11.2013 11:00.

import sys
# Command-line handling and file setup.
# The script expects exactly one argument: the path of the GenBank file.
if len(sys.argv) != 2:
    # sys.exit() with a string prints it to stderr and exits with status 1,
    # so a wrong invocation is reported as a failure to the calling shell.
    sys.exit("Script should receive 1 argument:\nthe name of GenBank file;")
file_r = sys.argv[1]
# Open the input first: if it cannot be opened, the script stops here and an
# existing output table is not truncated.
fileread = open(file_r)
filewrite = open("Pogorelskaya_NC_007759_genes.txt", "w")
# Header row of the tab-separated output table.
filewrite.write("#Locus_tag\tGene_name\tRefseq_protein_id\tProduct\tBegin\tEnd\tOrientation\tNote\n")

class ProteinCodingGene:
    """One CDS record: its coordinates, orientation code and qualifiers.

    All fields are kept as strings, exactly as they were passed to the
    constructor.
    """

    # Class-level defaults: every field is an empty string until set.
    begin = end = orientation = ""
    locus_tag = gene = protein_id = ""
    product = note = ""

    def __init__ (self,b,e,o,l,g,i,p,n):
        """Store begin, end, orientation, locus tag, gene name, protein id,
        product and note (in that argument order)."""
        self.begin, self.end, self.orientation = b, e, o
        self.locus_tag, self.gene, self.protein_id = l, g, i
        self.product, self.note = p, n

    def get_data(self):
        """Return the record as one tab-separated, newline-terminated row.

        Column order: locus_tag, gene, protein_id, product, begin, end,
        orientation, note.
        """
        columns = (
            self.locus_tag,
            self.gene,
            self.protein_id,
            self.product,
            self.begin,
            self.end,
            self.orientation,
            self.note,
        )
        return "\t".join(columns) + "\n"
def _drop_chars(text, chars):
    # Return `text` with every character listed in `chars` removed.
    # Replaces the Python-2-only str.translate(None, chars) call.
    for ch in chars:
        text = text.replace(ch, "")
    return text


def str_with_CDS(line,locus_tag,gene,protein_id,product,note):
    """Build a ProteinCodingGene from a CDS location string and qualifiers.

    line -- the CDS location with all whitespace removed, e.g. "12..345",
            "complement(12..345)", "join(1..5,10..20)",
            "complement(join(1..5,10..20))" or "<12..>345".
    locus_tag, gene, protein_id, product, note -- qualifier values that are
            passed through to the new object unchanged.

    The orientation code stored in the object is:
        "1"    plain range, both bounds are digits
        "111"  plain range where a bound is not purely digits (e.g. "<", ">");
               the "<" and ">" characters are removed from the bounds
        "11"   join(...)
        "-1"   complement(range), both bounds are digits
        "-111" complement(range) where a bound is not purely digits;
               "<", ">", "(" and ")" are removed from the bounds
        "-11"  complement(join(...))
        ""     any other complement(...) form; begin and end are empty too

    A location without ".." (a single position) yields begin == end instead
    of raising IndexError.
    NOTE: in the two join forms the bounds are taken as written, so a "<" or
    ">" marker is kept in begin/end there.
    """
    if line.split("(")[0] == "complement":
        # "complement(1..5)" -> ["complement", "1..5)"]
        # "complement(join(1..5,7..9))" -> ["complement", "join", "1..5,7..9))"]
        tokens = line.replace("(", " ").split()
        if len(tokens) != 2:
            if len(tokens) > 2 and tokens[1] == "join":
                bounds = tokens[2].split("..")
                begin = bounds[0]
                end = bounds[-1].strip(")")
                chain = "-11"
            else:
                # Unrecognised compound location (or a bare "complement").
                begin = ""
                end = ""
                chain = ""
        else:
            bounds = tokens[1].split("..")
            if len(bounds) == 1:
                # Single position: it is both the first and the last base.
                first = last = bounds[0].strip(")")
            else:
                first = bounds[0]
                last = bounds[1].strip(")")
            if first.isdigit() and last.isdigit():
                begin = first
                end = last
                chain = "-1"
            else:
                begin = _drop_chars(first, "><()")
                end = _drop_chars(last, "><()")
                chain = "-111"
    elif line.split("(")[0] == "join":
        # Take the first bound of the first segment and the last bound of
        # the last segment.
        bounds = line.split("(")[1].split("..")
        begin = bounds[0]
        end = bounds[-1].strip(")")
        chain = "11"
    else:
        bounds = line.split("..")
        # Without ".." the single position serves as both bounds.
        second = bounds[1] if len(bounds) > 1 else bounds[0]
        if bounds[0].isdigit() and second.isdigit():
            begin = bounds[0]
            end = second
            chain = "1"
        else:
            cleaned = _drop_chars(line, "><").split("..")
            begin = cleaned[0]
            end = cleaned[-1]
            chain = "111"
    # A new ProteinCodingGene object is made from the parsed coordinates.
    onegene = ProteinCodingGene(begin,end,chain,locus_tag,gene,protein_id,product,note)
    return onegene
# ---- Parser state --------------------------------------------------------
prodind = False   # True while a multi-line /product value is being read.
noteind = False   # True while a multi-line /note value is being read.
lineind = False   # True while a CDS location continues on the next line.
ind = False       # True while a CDS has been seen but not yet stored.
# Qualifiers collected for the current CDS; only these keys are recorded.
descdict = {}
descdict["locus_tag"] = ""
descdict["gene"] = ""
descdict["protein_id"] = ""
descdict["product"] = ""
descdict["note"] = ""
line = ""         # Location string of the current CDS, whitespace removed.
gene_list = []    # ProteinCodingGene objects in file order.


def _store_cds(location):
    # Turn the collected location and qualifiers into a ProteinCodingGene,
    # append it to gene_list and clear the qualifiers for the next CDS.
    gene_list.append(str_with_CDS(location,descdict["locus_tag"],descdict["gene"],descdict["protein_id"],descdict["product"],descdict["note"]))
    for key in descdict:
        descdict[key] = ""


for temp_str in fileread:
    temp_str = temp_str.strip()
    if temp_str == "":
        continue
    if prodind:
        # Continuation of a multi-line product; a trailing '"' ends it.
        descdict["product"] = descdict["product"]+" "+temp_str.strip('"')
        if temp_str[-1] == '"':
            prodind = False
    elif noteind:
        # Continuation of a multi-line note; a trailing '"' ends it.
        descdict["note"] = descdict["note"]+" "+temp_str.strip('"')
        if temp_str[-1] == '"':
            noteind = False
    elif lineind:
        # Continuation of a location; it goes on while the text collected so
        # far ends with a comma.
        line = line + temp_str
        if line[-1] != ",":
            lineind = False
    elif len(temp_str.split()) != 1:
        words = temp_str.split()
        # A "gene", "misc_feature" or new "CDS" line means that all data of
        # the previous CDS has been read, so it can be stored. This branch is
        # reached only when no product, note or location continuation is
        # pending, so these words inside such text do not trigger it.
        # Storing on "CDS" as well keeps two consecutive CDS features from
        # having their locations concatenated into one record.
        if words[0] in ("misc_feature", "gene", "CDS"):
            if ind:
                _store_cds(line)
                ind = False
                line = ""
        if words[0] == "CDS":
            ind = True
            lineind = True
            line = line + "".join(words[1:])
            if line[-1] != ",":
                # The location fits on this single line.
                lineind = False
    if ind and temp_str[0] == "/":
        temp_str = temp_str.strip("/")
        # Split on the first "=" only, so a value that itself contains "="
        # is kept whole.
        qual_name, _, qual_value = temp_str.partition("=")
        if qual_name in descdict:
            descdict[qual_name] = qual_value.strip('"')
            if temp_str[-1] != '"':
                # Only product and note are followed across several lines.
                if qual_name == "product":
                    prodind = True
                elif qual_name == "note":
                    noteind = True

# Store the last CDS of the file: no "gene" or "misc_feature" line has to
# follow it, so the loop above may end with a CDS still pending.
if ind:
    _store_cds(line)
    ind = False
    line = ""

fileread.close()
for gene in gene_list:
    filewrite.write (gene.get_data())
filewrite.close()
  





# (c) Pogorelskaya Sasha. Last modification date: 13.12.13