"""
parse_needle.py

Parses a needle alignment file and extracts:
  - Sequence names (IDs)
  - Score
  - % Identity
  - % Similarity
  - Gaps (number)
  - Indels (runs of consecutive gaps) in each sequence

Usage:
    python parse_needle.py file1.needle [file2.needle ...]

Output: a header row followed by one fixed-width, space-aligned row per input file.
"""

import sys
import re

def count_indels(sequence):
    """Return the number of indels in an aligned sequence string.

    An indel is a maximal run of one or more consecutive '-' characters,
    so "A--B-C" contains two indels and "ABC" contains none.
    """
    gap_runs = re.finditer(r'-+', sequence)
    return sum(1 for _ in gap_runs)

def parse_needle(filename):
    """Parse a needle alignment report and summarise it.

    The ``#``-prefixed header lines supply the sequence IDs and the
    statistics; the aligned rows are concatenated per sequence so that
    gap runs which continue across wrapped lines are counted once.

    Args:
        filename: Path of the needle output file to read.

    Returns:
        A dict that always contains ``indels1``, ``indels2`` and
        ``total_indels`` (ints). When the matching header line is present
        it also contains ``id1``, ``id2``, ``score`` and ``gaps`` (strings)
        and ``identity_pct`` / ``similarity_pct`` (strings ending in ``%``).
        If a header field occurs more than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(filename) as f:
        lines = f.read().splitlines()

    result = {}

    # "<matches>/<length> (<pct>%)" as used by the Identity/Similarity lines.
    pct_re = re.compile(r'(\d+)/\d+\s*\(\s*([\d.]+)%\)')
    count_re = re.compile(r'(\d+)/\d+')

    # Extract header fields
    for line in lines:
        if line.startswith('# 1:'):
            result['id1'] = line.split(':', 1)[1].strip()
        elif line.startswith('# 2:'):
            result['id2'] = line.split(':', 1)[1].strip()
        elif line.startswith('# Score:'):
            result['score'] = line.split(':', 1)[1].strip()
        elif line.startswith('# Identity:'):
            m = pct_re.search(line)
            if m:
                result['identity_pct'] = m.group(2) + '%'
        elif line.startswith('# Similarity:'):
            m = pct_re.search(line)
            if m:
                result['similarity_pct'] = m.group(2) + '%'
        elif line.startswith('# Gaps:'):
            m = count_re.search(line)
            if m:
                result['gaps'] = m.group(1)

    # Extract sequences
    name1 = result.get('id1', '')
    name2 = result.get('id2', '')
    seq1_parts = []
    seq2_parts = []

    # Row layout: "<label> <start> <residues> <end>". '*' is accepted so
    # that protein rows containing a stop codon are not dropped.
    row_re = re.compile(r'^(\S+)\s+\d+\s+([A-Za-z*\-]+)\s+\d+')

    for line in lines:
        if line.startswith('#') or not line.strip():
            continue
        m = row_re.match(line)
        if not m:
            continue
        label, residues = m.groups()

        # The alignment rows may show a shortened form of a long ID, so
        # the label is matched as a prefix of the header ID rather than
        # requiring equality.
        fits1 = name1.startswith(label)
        fits2 = name2.startswith(label)
        if fits1 and fits2:
            # Identical IDs (or identical shortened labels): the label
            # cannot tell the rows apart. Rows come in seq1/seq2 pairs, so
            # the sequence with fewer rows collected so far is next.
            fits2 = len(seq1_parts) > len(seq2_parts)
            fits1 = not fits2

        if fits1:
            seq1_parts.append(residues)
        elif fits2:
            seq2_parts.append(residues)

    seq1 = ''.join(seq1_parts)
    seq2 = ''.join(seq2_parts)

    result['indels1'] = count_indels(seq1)
    result['indels2'] = count_indels(seq2)
    result['total_indels'] = result['indels1'] + result['indels2']

    return result

def main():
    """Print a summary table for the needle files named on the command line.

    Writes a header row, a separator, and then one fixed-width row per
    file with both IDs, score, % identity, % similarity, gap count and the
    total number of indels. Fields missing from a file are shown as ``?``.

    A file that cannot be read is reported on stderr and skipped so that
    the remaining files are still processed; the process then exits with
    status 1. Exits with status 1 as well when no file is given.
    """
    if len(sys.argv) < 2:
        print("Usage: python parse_needle.py file1.needle [file2.needle ...]")
        sys.exit(1)

    print(f"{'ID1':<20} {'ID2':<20} {'Score':>8} {'%Ident':>8} {'%Simil':>8} {'Gaps':>6} {'Indels':>7}")
    print("-" * 85)

    any_failed = False
    for filename in sys.argv[1:]:
        try:
            r = parse_needle(filename)
        except (OSError, UnicodeDecodeError) as exc:
            # One bad path must not abort the rest of the batch.
            print(f"{filename}: {exc}", file=sys.stderr)
            any_failed = True
            continue
        print(f"{r.get('id1','?'):<20} {r.get('id2','?'):<20} "
              f"{r.get('score','?'):>8} {r.get('identity_pct','?'):>8} "
              f"{r.get('similarity_pct','?'):>8} {r.get('gaps','?'):>6} "
              f"{r.get('total_indels','?'):>7}")

    if any_failed:
        sys.exit(1)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
