#!/usr/bin/env python3

import os
import re


def parse_fuzznuc(fuzznuc_file):
    """Read hit coordinates from a whitespace-delimited fuzznuc report.

    A line is treated as a hit row when it has at least three
    whitespace-separated fields and the first two are digit strings. Every
    other line (headers, comments, blank lines, truncated rows) is skipped
    instead of raising.

    Args:
        fuzznuc_file: Path to the report file.

    Returns:
        A list of dicts in file order, each with integer 'start' and 'end'
        and a string 'strand' taken from the first three fields of the row.
    """
    hits = []
    with open(fuzznuc_file, 'r') as f:
        for line in f:
            parts = line.split()
            # Need start, end and strand; a short row cannot be a hit.
            if len(parts) < 3:
                continue
            # Both coordinates must be numeric before int() is attempted.
            if not (parts[0].isdigit() and parts[1].isdigit()):
                continue
            hits.append({
                'start': int(parts[0]),
                'end': int(parts[1]),
                'strand': parts[2],
            })
    return hits


def parse_gff_starts(gff_file):
    """Collect CDS start coordinates from a GFF file, grouped by strand.

    Lines starting with '#' and blank lines are ignored. For each
    tab-separated record whose third column is 'CDS', the start coordinate
    is column 4 on the '+' strand and column 5 on the '-' strand. Records
    with fewer than seven columns, or with a strand other than '+' or '-',
    are skipped.

    Args:
        gff_file: Path to the GFF file.

    Returns:
        A dict with keys '+' and '-', each mapping to a list of integer
        start coordinates in file order.

    Raises:
        ValueError: If the coordinate column of a stranded CDS record is
            not an integer.
    """
    starts = {'+': [], '-': []}
    with open(gff_file, 'r') as f:
        for line in f:
            if line.startswith('#') or not line.strip():
                continue
            # Drop the line ending so the last column carries no newline.
            parts = line.rstrip('\r\n').split('\t')
            # The strand is read from column 7, so shorter records are unusable.
            if len(parts) < 7 or parts[2] != 'CDS':
                continue
            strand = parts[6]
            if strand not in starts:
                # Any other strand value (e.g. '.' or '?') has no bucket here.
                continue
            start_pos = int(parts[3]) if strand == '+' else int(parts[4])
            starts[strand].append(start_pos)
    return starts


# Paths to your input files (replace with your own).
# open() does not expand "~", so resolve it to the home directory explicitly.
fuzz_hits = parse_fuzznuc(os.path.expanduser('~/term4/pr8/second.txt'))
cds_starts = parse_gff_starts(
    os.path.expanduser('~/term1/genome/GCF_014672695.1_ASM1467269v1_genomic.gff')
)

# Sets give constant-time membership tests, so each hit checks only the five
# candidate positions instead of scanning every CDS start.
plus_starts = set(cds_starts['+'])
minus_starts = set(cds_starts['-'])

# NOTE(review): coordinates are compared without any sequence ID, so this
# presumably assumes a single-sequence genome -- confirm for multi-contig input.
count = 0
total_sd = len(fuzz_hits)

for sd in fuzz_hits:
    if sd['strand'] == '+':
        # Match when a '+' CDS start lies 6..10 positions after the hit's end.
        candidates = range(sd['end'] + 6, sd['end'] + 11)
        found_for_this_sd = any(pos in plus_starts for pos in candidates)
    else:
        # Match when a '-' CDS start lies 6..10 positions before the hit's start.
        candidates = range(sd['start'] - 10, sd['start'] - 5)
        found_for_this_sd = any(pos in minus_starts for pos in candidates)

    if found_for_this_sd:
        count += 1

# Guard against an empty hit list, which would otherwise divide by zero.
percent = (count / total_sd) * 100 if total_sd else 0.0

print(f"Всего находок ПШД: {total_sd}")
print(f"Из них в 'правильной' позиции (6-10 нт до CDS): {count}")
print(f"Процент: {percent:.2f}%")
