SEQUENCE = [] with open("GCF.fna") as genome: # read chromosome sequence from fasta-file genome.readline() for line in genome: SEQUENCE.append(line.strip()) SEQUENCE = "".join(SEQUENCE) CDS = [] with open("g.gff") as ft: # read all CDS and information about them from annotation file gff3 for line in ft: row = line.strip().split("\t") if (line.startswith("#") or not line.strip()) or row[2] != "CDS": continue start = int(row[3]) end = int(row[4]) strand = row[6] info = row[8].split(";")[-3][8:] CDS.append((start, end, strand, info)) def complement(seq): # function, that return complement seq for input seq pairs = {"A": "T", "G": "C", "C": "G", "T": "A"} result = [] for letter in seq: result.append(pairs[letter]) return "".join(result)[::-1] def cutter(seq, pos, number): # function, that cut a part with number length before (if number < 0) or after (number > 0) from seq for pos (include pos) if number > 0: if pos + number > len(seq): return seq[pos:] + seq[:pos + number - len(seq)] return seq[pos:pos + number] if pos + number < 0: return seq[pos + number + 1:] + seq[:pos + 1] return seq[pos + number:pos] with open("POSITIVE.fasta", "w") as file: # create fasta-file with positive-control group i = 1 _seq = "" for cds in CDS: print(f">Seq{i} {cds[3]}", file=file) if cds[2] == "+": _seq = cutter(SEQUENCE, cds[0], -26)[:-1] else: _seq = complement(cutter(SEQUENCE, cds[1], 26)[1:]) print(_seq, file=file) i += 1 with open("NEGATIVE.fasta", "w") as file: # create fasta-file with negative-control group i = 1 _seq = "" for cds in CDS: print(f">Seq{i} {cds[3]}", file=file) if cds[2] == "+": _seq = cutter(SEQUENCE, cds[0], 25) else: _seq = complement(cutter(SEQUENCE, cds[1], -25)) print(_seq, file=file) i += 1 with open("TRAIN.fasta", "w") as file: # create fasta-file with train-group kws = ["ibosomal", "polymerase", "ranscription", "ranslation"] i = 1 _seq = "" for cds in CDS: if not any(kw in cds[3] for kw in kws): continue print(f">Seq{i} {cds[3]}", file=file) if cds[2] == "+": _seq = cutter(SEQUENCE, cds[0], -26)[:-1] else: _seq = complement(cutter(SEQUENCE, cds[1], 26)[1:]) print(_seq, file=file) i += 1