import random compl = {"A": "T", "T": "A", "G": "C", "C": "G"} def my_cut(genome, start): my_start = start - 101 my_end = start - 1 if my_end > len(genome): before_seq = genome[my_start:] + genome[:my_end-len(genome)] else: before_seq = genome[my_start:my_end] return before_seq def revers_cut(genome, end): my_start = end my_end = end + 100 if my_end > len(genome): after_seq = genome[my_start:] + genome[:my_end-len(genome)] else: after_seq = genome[my_start:my_end] after_seq = list(after_seq) for elem in after_seq: elem = compl[elem] seq = "".join(after_seq)[::-1] return seq fasta = "GCF_000005845.2_ASM584v2_genomic.fna" operons_list = "list_of_operons.txt" with open(fasta) as inp: header = inp.readline().strip() name = header.split()[0][1:] seq = [] for line in inp: line = line.strip() seq.append(line) FULLSEQ = "".join(seq) OPERONS = [] with open(operons_list) as inp: header = inp.readline() line = inp.readline() i = 0 while line: line = line.strip().split("\t") if len(line) != 1: IdGene = line[0] start = int(line[3]) end = int(line[4]) strand = line[5] func = line[6] operon = (IdGene, start, end, strand, func) OPERONS.append(operon) line = inp.readline() PROMOTORS = dict() for operon in OPERONS: if operon[3] == "+": promotor_seq = my_cut(FULLSEQ, operon[1]) elif operon[3] == "-": promotor_seq = revers_cut(FULLSEQ, operon[2]) PROMOTORS[operon[0]] = promotor_seq with open("promotors.fasta", "w") as out: for IdGene, seq in PROMOTORS.items(): print(f">{IdGene}\n{seq}", file=out) HOUSEKEEPING = dict() key_words = ["ibosomal", "polymerase", "ranscription", "ranslation", "ATP synthase"] for operon in OPERONS: for word in key_words: if word in operon[4]: HOUSEKEEPING[operon[0]] = PROMOTORS[operon[0]] with open("housekeeping.fasta", "w") as out: for IdGene, seq in HOUSEKEEPING.items(): print(f">{IdGene}\n{seq}", file=out) with open("negative.fasta", "w") as out: NEGATIVES = dict() random.seed(75) rand_coords = [] for i in range(len(HOUSEKEEPING)): rand_coords.append(random.randint(0, len(FULLSEQ))) for i in rand_coords: NEGATIVES[i] = my_cut(FULLSEQ, i) for IdGene, seq in NEGATIVES.items(): print(f">{IdGene}\n{seq}", file=out)