"""Test whether the EcoRI site GAATTC is over/under-represented in a FASTA file.

The script reads ``sequence.fasta``, counts occurrences of the motif, computes
the count expected from the observed single-nucleotide frequencies (assuming
independent positions), and runs a chi-square goodness-of-fit test comparing
the two.
"""
import collections
from typing import Iterable

from scipy.stats import chisquare

FASTA_PATH = 'sequence.fasta'
MOTIF = 'GAATTC'


def read_sequence(lines: Iterable[str]) -> str:
    """Return the sequence residues from FASTA-formatted *lines*.

    Header lines (starting with ``>``) and blank lines are skipped so that
    description text is never counted as sequence data. Sequence lines from
    all records are concatenated in order.
    """
    stripped = (line.strip() for line in lines)
    return ''.join(s for s in stripped if s and not s.startswith('>'))


def count_motif(seq: str, motif: str) -> int:
    """Return the number of (possibly overlapping) occurrences of *motif*."""
    if not motif:
        return 0
    # range() is empty when the sequence is shorter than the motif.
    last_start = len(seq) - len(motif) + 1
    return sum(seq.startswith(motif, i) for i in range(last_start))


def expected_count(seq: str, motif: str) -> float:
    """Return the expected number of *motif* hits under base independence.

    The probability of the motif at one position is the product of the
    observed frequencies of its bases; it is multiplied by the number of
    windows of ``len(motif)`` that fit in *seq*. The value is NOT rounded,
    so it can be fed to the chi-square test without losing precision.
    """
    windows = len(seq) - len(motif) + 1
    if windows <= 0:
        return 0.0
    base_counts = collections.Counter(seq)
    total = len(seq)
    probability = 1.0
    for base in motif:
        probability *= base_counts[base] / total
    return probability * windows


def motif_chisquare(count: int, expected: float, windows: int):
    """Chi-square goodness-of-fit test of observed vs. expected motif hits.

    Each window is classified as "motif" or "not motif": the observed
    frequencies are ``[count, windows - count]`` and the expected ones are
    ``[expected, windows - expected]``. Passing only ``[count, expected]``
    would treat the expectation as a second observation and test both
    against their mean, which is not the intended hypothesis.

    Raises:
        ValueError: if *expected* is not strictly between 0 and *windows*,
            because the test statistic is undefined for a zero expected
            frequency.
    """
    if not 0 < expected < windows:
        raise ValueError(
            'expected count must be strictly between 0 and the number of windows'
        )
    return chisquare(
        [count, windows - count],
        f_exp=[expected, windows - expected],
        ddof=0,
    )


def main() -> None:
    """Read the FASTA file and print the count, expected count and test result."""
    with open(FASTA_PATH, 'r') as inp:
        seq = read_sequence(inp)
    if not seq:
        raise SystemExit(FASTA_PATH + ' contains no sequence data')

    count = count_motif(seq, MOTIF)
    print(count)

    expected = expected_count(seq, MOTIF)
    # Rounded for display only; the test below uses the exact value.
    print(round(expected))

    windows = len(seq) - len(MOTIF) + 1
    try:
        print(motif_chisquare(count, expected, windows))
    except ValueError as err:
        print('chi-square test not applicable: ' + str(err))


if __name__ == '__main__':
    main()