import re
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde

# input_file = "C:/Users/jfedo/OneDrive/Documents/public_html/term4/pr11/uniprotkb_xref_pfam_PF00808_2026_06_25.tsv"  # Replace with your own file name if it differs
# output_fasta = "C:/Users/jfedo/OneDrive/Documents/public_html/term4/pr11/PF00808_pairs.fasta"
# df = pd.read_csv(input_file, sep="\t")
# df.columns = [col.strip() for col in df.columns]
# with open(output_fasta, "w") as out_file:
#     for idx, row in df.iterrows():
#         acc = row["Entry"]
#         sequence = str(row["Sequence"])
#         domain_ft = str(row["Domain [FT]"])
#         if pd.isna(row["Domain [FT]"]) or domain_ft == "nan":
#             continue
#         pattern = 'PF00808'
#         matches = re.findall(pattern, domain_ft)
#         if len(matches) == 2:
#             first_start = domain_ft.split('..')[0].split(' ')[1]
#             first_end = domain_ft.split('..')[1].split(';')[0]
#             second_start = domain_ft.split('..')[1].split(' ')[-1]
#             second_end = domain_ft.split('..')[2].split(';')[0]
#             for start_str, end_str in ((first_start, first_end), (second_start, second_end)):
#                 start = int(start_str)
#                 end = int(end_str)
#                 domain_seq = sequence[start - 1 : end]
#                 out_file.write(f">{acc}_PF00808_{start}_{end}\n{domain_seq}\n")

def kde_plot_maker(v, ax, name):
    """Plot a Gaussian KDE of ``log10(v)`` on ``ax`` and return the curve's peak.

    The density is estimated from ``log10(v)`` and evaluated at those same
    sample points (not on a regular grid), so the curve only spans the range
    of the data.

    Args:
        v: Array-like of numbers whose decimal logarithms are analysed.
        ax: Object exposing a Matplotlib-style ``plot(x, y, label=...)``.
        name: Legend label for the plotted curve.

    Returns:
        The largest density value among the evaluated points.
    """
    # Sort before plotting: ``ax.plot`` joins points in the order given, so
    # unsorted samples would be drawn as a zigzag rather than a smooth curve.
    # The KDE itself does not depend on the order of the samples.
    x = np.sort(np.log10(v))
    k = gaussian_kde(x)
    y = k(x)
    ax.plot(x, y, label=name)
    return np.max(y)

def selector(
    results_path='C:/Users/jfedo/OneDrive/Documents/public_html/term4/pr11/search_results.txt',
    fasta_path='C:/Users/jfedo/OneDrive/Documents/public_html/term4/pr11/PF00808_pairs.fasta',
    header_lines=15,
    max_hits=28959,
    threshold=-33,
):
    """Split table values by FASTA membership and plot both log10 densities.

    Accessions are taken from the FASTA headers (the text between ``>`` and
    the first ``_``). Each table row is split on whitespace: the first field
    is parsed as a float, and the ninth field is expected to look like
    ``db|ACCESSION|name``. Rows whose accession occurs in the FASTA file go
    to the "selected" group, all others to "rest". A KDE curve of the log10
    values is drawn for each group, together with a dotted vertical line at
    ``threshold``.

    NOTE(review): the layout presumably matches an hmmsearch-style hit table
    with the E-value in the first column — confirm against the actual file.

    Args:
        results_path: Text file containing the hit table.
        fasta_path: FASTA file whose headers define the selected accessions.
        header_lines: Number of leading lines to skip before the table.
        max_hits: Maximum number of table rows to read.
        threshold: x position (log10 scale) of the dotted reference line.

    Raises:
        IndexError: If a non-empty row has fewer than nine fields or its
            ninth field does not contain ``|``.
        ValueError: If the first field of a row is not a valid float.
    """
    # A set makes each membership test O(1); the table may hold tens of
    # thousands of rows, so scanning a list per row is needlessly slow.
    with open(fasta_path, 'r') as fasta:
        accessions = {
            header[1:].split('_')[0]
            for header in fasta
            if header.startswith('>')
        }

    selected = []
    rest = []
    with open(results_path, 'r') as txt:
        for _ in range(header_lines):
            txt.readline()
        for _ in range(max_hits):
            fields = txt.readline().split()[:9]
            if not fields:
                # End of file or a blank line: the table is over, so stop
                # instead of failing on a missing column.
                break
            value = float(fields[0])
            if fields[8].split('|')[1] in accessions:
                selected.append(value)
            else:
                rest.append(value)

    # Both files are closed by now; plotting does not need them.
    _, ax = plt.subplots()
    ymax = max(
        0,
        kde_plot_maker(selected, ax, 'selected'),
        kde_plot_maker(rest, ax, 'rest'),
    )
    # Dotted vertical marker spanning from zero up to the taller curve's peak.
    ax.plot([threshold, threshold], [0, ymax], ':')
    ax.set(title='Плотность десятичных логарифмов для подсемейства и остальных белков')
    ax.legend()
    ax.grid()
# Script entry point: build the density figure, then display it.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so these
# statements also run whenever this module is imported.
selector()
plt.show()