Скрипт, читающий файл генома |
|
Скачать скрипт
Чтобы запустить скрипт в коммандную строку нужно ввести имя самого скрипта, имя читаемого файла и название папки, куда будет сохраняться информация. Например: Genome_reader.py NC_004193.gbk O.iheyensis #Head# MSU. Bioinformatics # Term1. Block2. Praktice 8. Task 2. # Genome_reader 2.0 (added possibility to make new fold, ehere files will be saved and possibility to get information about RNAs. Operating time of script is significantly reduced) # This script reads file *.gbk, create new folder and create in this folder files, where information about genes, which encode proteins, different RNAs, is saved (for each type - new file). It also create file, where information about all "strange" genes is saved. (genes, whith "join" or genes, which are not exactly defined (genes with "<" or ">").) # Autor: Demkiv A.O. # Last modification date: 30.11.2013 #Information, which must be recieved from user import sys if len(sys.argv) != 3: print "Script should recieve 2 arguments" print "First - name of file to open" print "Second - name of fold, where information should be saved" exit() need_RNA = raw_input("Do you want to make file with information about RNA?(please, write 'yes' or 'no')").lower() #Open file fo reading file_name = sys.argv[1] gen_file = open(file_name) #Create new folder import os try: os.makedirs(sys.argv[2]) except OSError: pass #Create new files in the folder new_file = open("./%s/%s.txt"%(sys.argv[2], sys.argv[2]), "w") new_file.write("locus_tag\tfirst_nuc\tlast_nuc\torientation\tgene_name\trefseq_protein_id\tproduct") strange_file = open("./%s/Strange_genes_%s.txt" %(sys.argv[2], sys.argv[2]), "w") strange_file.write("locus_tag\tfirst_nuc\tlast_nuc\torientation\tgene_name\trefseq_protein_id\tproduct") if need_RNA == "yes": tRNA_file = open("./%s/tRNA_%s.txt"%(sys.argv[2], sys.argv[2]), "w") tRNA_file.write("locus_tag\tbegin\tend\torientation\tname\tproduct") rRNA_file = open("./%s/rRNA_%s.txt"%(sys.argv[2], sys.argv[2]), "w") rRNA_file.write("locus_tag\tbegin\tend\torientation\tname\tproduct") ncRNA_file = open("./%s/ncRNA_%s.txt"%(sys.argv[2], sys.argv[2]), "w") ncRNA_file.write("locus_tag\tbegin\tend\torientation\tname\tproduct") #Create class class Gene: begin = 1 end = 100 orientation = 1 locus_tag = "" name = "" protein_id = "" product = "" def protein_write(self): new_file.write("\n%s\t%s\t%s\t%s\t%s\t%s\t%s" %(self.locus_tag, self.begin, self.end, self.orientation, self.name, self.protein_id, self.product)) def tRNA_write(self): tRNA_file.write("\n%s\t%s\t%s\t%s\t%s\t%s"%(self.locus_tag, self.begin, self.end, self.orientation, self.name, self.product)) def rRNA_write(self): rRNA_file.write("\n%s\t%s\t%s\t%s\t%s\t%s"%(self.locus_tag, self.begin, self.end, self.orientation, self.name, self.product)) def ncRNA_write(self): ncRNA_file.write("\n%s\t%s\t%s\t%s\t%s\t%s"%(self.locus_tag, self.begin, self.end, self.orientation, self.name, self.product)) def strange_genes(self): strange_file.write("\n%s\t%s\t%s\t%s\t%s\t%s\t%s" %(self.locus_tag, self.begin, self.end, self.orientation, self.name, self.protein_id, self.product)) def get_dop(self, k, protein_list): line = protein_list[k].split("\n") for l in range(len(line)): lline = line[l].strip().split("=") if (lline[0] == "/gene"): self.name = lline[1].translate(None, '"') elif (lline[0] == "/locus_tag"): self.locus_tag = lline[1].translate(None, '"') elif (lline[0] == "/protein_id"): self.protein_id = lline[1].translate(None, '"') elif (lline[0] == "/product"): if (line[l].strip())[-1] != '"': x = 0 quote_found = False while not(quote_found): if x == 0: self.product = lline[1].translate(None, '"') x = x + 1 else: self.product = self.product + line[l+x].strip().translate(None, '"') if (line[l+x].strip())[-1] == '"': quote_found = True x = x + 1 else: self.product = lline[1].translate(None, '"') def end_begin_simple(self, buf1): self.begin = buf1[0].strip("(") self.end = buf1[-1].strip(")") def end_begin(self, buf1, protein_list, k): self.begin = buf1[0].strip("(") o = 0 string = " " while string[-1] != ")": string = protein_list[k].split("\n")[o].strip() buf1 = string.translate(None, "<>(),").split("..") o = o + 1 self.end = buf1[-1] def __init__(self): 0 #Create lists of CDS and RNA blocks protein_list = [] protein_list.append("") u = 0 CDS_block = True if need_RNA == "yes": tRNA_block = True tRNA_list = [] tRNA_list.append("") t = 0 rRNA_block = True rRNA_list = [] rRNA_list.append("") r = 0 ncRNA_block = True ncRNA_list = [] ncRNA_list.append("") nc = 0 for line in gen_file: #CDS block if " CDS " in line: protein_list.append("") u = u + 1 CDS_block = True if (CDS_block == True) and (len(line) > 8) and (line[5] != " ") and (line[5] != "C") and (line[6] != " ") and (line[6] != "D") and (line[7] != " ") and (line[7] != "S"): CDS_block = False if (CDS_block == True): protein_list[u] = protein_list[u] + line if need_RNA == "yes": #tRNA block if " tRNA " in line: tRNA_block = True tRNA_list.append("") t = t+1 if (tRNA_block == True) and (len(line) > 8) and (line[5] != " ") and (line[5] != "t") and (line[6] != " ") and (line[6] != "R") and (line[7] != " ") and (line[7] != "N") and (line[8] != " ") and (line[8] != "A"): tRNA_block = False if tRNA_block == True: tRNA_list[t] = tRNA_list[t] + line #rRNA block if " rRNA " in line: rRNA_block = True rRNA_list.append("") r = r+1 if (rRNA_block == True) and (len(line) > 8) and (line[5] != " ") and (line[5] != "r") and (line[6] != " ") and (line[6] != "R") and (line[7] != " ") and (line[7] != "N") and (line[8] != " ") and (line[8] != "A"): rRNA_block = False if rRNA_block == True: rRNA_list[r] = rRNA_list[r] + line #ncRNA block if " ncRNA " in line: ncRNA_block = True ncRNA_list.append("") nc = nc + 1 if (ncRNA_block == True) and (len(line) > 9) and (line[5] != " ") and (line[5] != "n") and (line[6] != " ") and (line[6] != "c") and (line[7] != " ") and (line[7] != "R") and (line[8] != " ") and (line[8] != "N") and (line[9] != " ") and (line[9] != "A"): ncRNA_block = False if ncRNA_block == True: ncRNA_list[nc] = ncRNA_list[nc] + line #Function to extract information def information(x, y, element): i = 0 for k in range(len(x)): buf = x[k].split("\n")[0].strip(element) coordinate = buf.strip() coordinate_buf = coordinate.translate(None, "().<>") if coordinate_buf.isdigit(): buf1 = coordinate.translate(None, "<>").split("..") y.append(Gene()) y[i].end_begin_simple(buf1) for g in coordinate: if (g == ">") or (g == "<"): y[i].orientation = 1111 break else: y[i].orientation = 1 y[i].get_dop(k, x) if y[i].orientation == 1111: y[i].strange_genes() i = i+1 elif coordinate_buf.strip("join").translate(None, ',').isdigit(): buf3 = coordinate.strip("join").translate(None, '<>').split(",") if buf3[-1] == '': buf3.remove('') buf1 = [buf3[0].split("..")[0]] + [buf3[-1].split("..")[1]] y.append(Gene()) y[i].end_begin(buf1, x, k) for g in coordinate: if (g == ">") or (g == "<"): y[i].orientation = 111 break else: y[i].orientation = 11 y[i].get_dop(k, x) y[i].strange_genes() i = i+1 else: buf2 = coordinate.strip("complement").translate(None, "<>()") if not(buf2.translate(None, ".").isdigit()): if buf2.strip("join").translate(None, "().,").isdigit(): buf3 = buf2.strip("join").split(",") if buf3[-1] == '': buf3.remove('') buf1 = [buf3[0].split("..")[0]] + [buf3[-1].split("..")[1]] y.append(Gene()) y[i].end_begin(buf1, x, k) for g in coordinate: if (g == ">") or (g == "<"): y[i].orientation = -111 break else: y[i].orientation = -11 y[i].get_dop(k, x) y[i].strange_genes() i = i+1 else: continue elif buf2.translate(None, ".").isdigit(): buf1 = buf2.split("..") y.append(Gene()) y[i].end_begin(buf1, x, k) for g in coordinate: if (g == ">") or (g == "<"): y[i].orientation = -1111 break else: y[i].orientation = -1 y[i].get_dop(k, x) if y[i].orientation == -1111: y[i].strange_genes() i = i+1 #Using function protein_llist = [] information(protein_list, protein_llist, " CDS ") for elem in protein_llist: elem.protein_write() if need_RNA=="yes": tRNA_llist = [] information(tRNA_list, tRNA_llist, " tRNA ") for elem in tRNA_llist: elem.tRNA_write() rRNA_llist = [] information(rRNA_list, rRNA_llist, " rRNA ") for elem in rRNA_llist: elem.rRNA_write() ncRNA_llist = [] information(ncRNA_list, ncRNA_llist, " ncRNA ") for elem in ncRNA_llist: elem.ncRNA_write() #Closing files new_file.close() strange_file.close() if need_RNA == "yes": tRNA_file.close() rRNA_file.close() ncRNA_file.close() Протестировать работу скрипта можно, например на этом файле: NC_004193.gbk - файл, содержащий геном бактерии Oceanobacillus iheyensis штамм HTE831. |
© Демкив Андрей 2013 | Дата последнего изменения: 29.05.2015 |