import random

compl = {"A": "T",
         "T": "A",
         "G": "C",
         "C": "G"}

def my_cut(genome, start):
    my_start = start - 101
    my_end = start - 1
    if my_end > len(genome):
        before_seq = genome[my_start:] + genome[:my_end-len(genome)]
    else:
        before_seq = genome[my_start:my_end]
    return before_seq

def revers_cut(genome, end):
    my_start = end
    my_end = end + 100
    if my_end > len(genome):
        after_seq = genome[my_start:] + genome[:my_end-len(genome)]
    else:
        after_seq = genome[my_start:my_end]
    after_seq = list(after_seq)
    for elem in after_seq:
        elem = compl[elem]
    seq = "".join(after_seq)[::-1]
    return seq


fasta = "GCA_900444805.1_58932_B01_genomic.fasta"
operons_list = "list_of_operons_720138.txt"


with open(fasta) as inp:
    header = inp.readline().strip()
    name = header.split()[0][1:]
    seq = []
    for line in inp: 
        line = line.strip()
        seq.append(line)
    FULLSEQ = "".join(seq)


OPERONS = []
with open(operons_list) as inp:
    header = inp.readline()
    line = inp.readline()
    i = 0
    while line:
        line = line.strip().split("\t")
        if len(line) != 1:
            IdGene = line[0]
            start = int(line[3])
            end = int(line[4])
            strand = line[5]
            func = line[6]
            operon = (IdGene, start, end, strand, func)
            OPERONS.append(operon)
        line = inp.readline()

PROMOTORS = dict()
for operon in OPERONS:
    if operon[3] == "+":
        promotor_seq = my_cut(FULLSEQ, operon[1])
    elif operon[3] == "-":
        promotor_seq = revers_cut(FULLSEQ, operon[2])
    PROMOTORS[operon[0]] = promotor_seq

with open("promotors.fasta", "w") as out:
    for IdGene, seq in PROMOTORS.items():
        print(f">{IdGene}\n{seq}", file=out)
        
HOUSEKEEPING = dict()
key_words = ["ibosomal", "polymerase", "ranscription",
             "ranslation", "ATP synthase"]


for operon in OPERONS:
    for word in key_words:
        if word in operon[4]:
            HOUSEKEEPING[operon[0]] = PROMOTORS[operon[0]]
  

with open("housekeeping.fasta", "w") as out:
    for IdGene, seq in HOUSEKEEPING.items():
        print(f">{IdGene}\n{seq}", file=out)  
        
with open("negative.fasta", "w") as out:
    NEGATIVES = dict()
    random.seed(75)
    rand_coords = []
    for i in range(len(HOUSEKEEPING)):
        rand_coords.append(random.randint(0, len(FULLSEQ)))
    for i in rand_coords:
       NEGATIVES[i] = my_cut(FULLSEQ, i)
    for IdGene, seq in NEGATIVES.items():
        print(f">{IdGene}\n{seq}", file=out)
