from urllib import request
import json
import time


def domain_coords(AC_list, domain_AC):
    """Fetch start/end coordinates of a Pfam entry for each protein accession.

    For every accession in ``AC_list`` one request is sent to the InterPro
    REST API, and the ``start``/``end`` values of the first fragment of the
    first location of the first returned entry are recorded. Progress is
    printed after every tenth accession and ``Done`` after the last one.

    Args:
        AC_list: Sized sequence of protein accessions to look up (``len()``
            is used for progress reporting).
        domain_AC: Pfam accession inserted into the request URL.

    Returns:
        dict mapping each accession to a ``(start, end)`` tuple with the
        values exactly as reported by the API.

    Raises:
        urllib.error.URLError: If a request fails (includes HTTP errors).
        json.JSONDecodeError: If a response body is not valid JSON.
        KeyError, IndexError: If a response lacks the expected
            entries/locations/fragments structure.
    """
    output = {}
    total = len(AC_list)
    print(f"Fetching coordinates for domain {domain_AC}")
    for done, AC in enumerate(AC_list, start=1):
        url = f"https://www.ebi.ac.uk/interpro/api/protein/uniprot/{AC}/entry/pfam/{domain_AC}"
        # The context manager closes the HTTP response (and its socket) even
        # when reading or decoding fails; previously it was never closed.
        with request.urlopen(request.Request(url)) as res:
            out = json.loads(res.read().decode())
        # NOTE(review): a protein without this domain presumably yields a body
        # without usable "entries"; that still surfaces as an exception here.
        first_fragment = out["entries"][0]["entry_protein_locations"][0]["fragments"][0]
        output[AC] = (first_fragment["start"], first_fragment["end"])
        if done == total:
            print("Done")
        elif done % 10 == 0:
            print(f"{done} of {total} is done")
        # Pause after every request (including the last) so that consecutive
        # calls to this function stay spaced out as well.
        time.sleep(1)
    return output



# Identifier -> full sequence, parsed from the input FASTA file.
subfamily = {}
# Identifier -> domain subsequence; both are populated further down the script.
domains_CsgG = {}
domains_PGBD = {}

with open("CsgG_sub.fasta", "r") as handle:
    # Splitting on ">" yields one chunk per record; the text before the
    # first ">" is discarded.
    records = handle.read().split(">")[1:]

for record in records:
    # First line is the header, the remaining lines hold the sequence.
    head, *seq_lines = record.split("\n")
    # The identifier is taken as the first "|"-separated header field.
    # NOTE(review): assumes headers start with the accession - confirm
    # against the actual file format.
    AC = head.split("|")[0]
    subfamily[AC] = "".join(seq_lines)

# Look up the coordinates of both Pfam entries for every parsed identifier.
coords_CsgG = domain_coords(list(subfamily), "PF03783")
coords_PGBD = domain_coords(list(subfamily), "PF01471")

for AC, seq in subfamily.items():
    csgg_start, csgg_end = coords_CsgG[AC]
    pgbd_start, pgbd_end = coords_PGBD[AC]
    # Coordinates are treated as 1-based inclusive: subtracting one from the
    # start converts them to a Python slice.
    domains_CsgG[AC] = seq[csgg_start - 1 : csgg_end]
    domains_PGBD[AC] = seq[pgbd_start - 1 : pgbd_end]

# Write each set of extracted domain sequences to its own FASTA file,
# one two-line record (header, sequence) per identifier.
for out_path, domains in (
    ("CsgG_CsgG.fasta", domains_CsgG),
    ("CsgG_PGBD.fasta", domains_PGBD),
):
    with open(out_path, "w") as outfile:
        outfile.writelines(f">{ac}\n{s}\n" for ac, s in domains.items())