In [1]:
# Load the genome assembly and count occurrences of the dinucleotide 'TA'.
# Every non-header line is concatenated into a single string, so if the file
# holds more than one FASTA record they are joined end to end.
# NOTE(review): with several records, each junction contributes one artificial
# dinucleotide to the count — confirm whether per-record counting is wanted.
with open('GCA_000008865.2_ASM886v2_genomic.fna', mode='r') as fasta:
    # FASTA header lines begin with '>'; everything else is sequence data.
    # The context manager closes the file even if reading fails.
    lst = [x for x in (line.strip() for line in fasta) if not x.startswith('>')]
sequence = ''.join(lst)

# 'TA' cannot overlap with itself, so str.count (non-overlapping matches)
# gives exactly the number of positions i where sequence[i:i+2] == 'TA',
# without a Python-level loop over every base.
counter = sequence.count('TA')
print(counter)
365293
In [2]:
from scipy import stats

# One-sided binomial test: is 'TA' observed less often than expected?
# `sequence` and `counter` come from the cell that loads the genome and
# counts 'TA'; using `counter` keeps this cell in sync with that result
# instead of relying on a pasted number.

# A sequence of length n has n - 1 adjacent-base (dinucleotide) positions,
# which is the number of positions the count was taken over.
n_positions = len(sequence) - 1

# Expected probability of 'TA' at a position under the null hypothesis.
# NOTE(review): 0.248 and 0.247 are presumably the genome-wide frequencies of
# T and A (product = expectation under independence) — confirm their origin.
p_expected = 0.248 * 0.247

# stats.binom_test was deprecated in SciPy 1.10 and removed in 1.12;
# stats.binomtest (available since SciPy 1.7) is its replacement and returns
# a result object whose .pvalue attribute holds the p-value.
result = stats.binomtest(counter, n_positions, p_expected, alternative='less')
p_value = float(result.pvalue)
p_value
Out[2]:
0.0