!RUNS ON LINUX ONLY!
Requires EMBOSS and MAFFT installed
import random
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
Starting point: the FASTA and feature table files of the Pyrococcus abyssi chromosome assembly (NCBI accession AL096836.1) have already been downloaded.
Obtaining the CDS coordinate table of the P. abyssi chromosome with features2CDSs.py script:
# Run the external features2CDSs.py helper on the feature table (-i p_abyssi.txt);
# the CDS coordinate table it produces is written to p_abyssi.xls (-o).
!python3 ~/Programs/my_scripts/features2CDSs.py -i p_abyssi.txt -o p_abyssi.xls
wait.. 1783 CDSs stored ..done
Selecting and filtering CDSs with no "hypothetical" or "putative" in their names:
# Drop every line that mentions "hypothetical" or "putative" (case-insensitive,
# matched anywhere on the line, not only in the product column) and stage the
# remaining lines in a temporary file.
!grep -E -iv "hypothetical|putative" p_abyssi.xls > temp.xls
# The table is tab-separated; index_col=False stops pandas from using the
# first column as the index.
good_cds = pd.read_csv("temp.xls", sep="\t", index_col=False)
# The temporary file is not needed once the table is loaded into memory.
!rm temp.xls
# Preview the first three rows.
good_cds.head(n=3)
min_coord | max_coord | ori | pseudo | protein_id | product | |
---|---|---|---|---|---|---|
0 | 1 | 61 | -1 | NaN | NaN | LSU ribosomal protein L1P (rpl1P) |
1 | 125 | 619 | -1 | NaN | NaN | rpl11P LSU ribosomal protein L11P |
2 | 631 | 1089 | -1 | NaN | NaN | nusG transcription antitermination protein |
all(pd.isna(good_cds["pseudo"]))
True
any(pd.isna(good_cds["product"]))
True
# Discard the rows that carry no product annotation.
good_cds = good_cds[good_cds["product"].notna()]
Filtering the selected CDSs by length, keeping those whose coordinate span (max_coord − min_coord) is at least 300 nt:
# Keep the CDSs whose coordinate span (max_coord - min_coord) is at least 300
# and renumber the surviving rows from 0.
# NOTE(review): if the coordinates are 1-based and inclusive, the sequence
# length is max_coord - min_coord + 1, so this keeps CDSs of 301 nt or more —
# confirm the intended threshold before changing it (downstream counts depend on it).
good_cds = good_cds[good_cds["max_coord"] - good_cds["min_coord"] >= 300].reset_index(drop=True)
len(good_cds)
972
Randomly sampling 50 CDS of the filtered set:
# Fix the seed so that the same 50 row positions are drawn on every run.
random.seed(777)
# NOTE(review): random.choices draws WITH replacement, so a row could in
# principle be picked twice; random.sample would guarantee distinct rows but
# would change which CDSs are drawn for this seed.
sampled_cds = good_cds.iloc[random.choices(range(len(good_cds)), k=50)].reset_index(drop=True)
# Re-seed without an argument so later uses of `random` are not tied to 777.
random.seed()
# Sanity check: every sampled product name is distinct, which also means no
# row was drawn twice.
sampled_cds["product"].nunique() == len(sampled_cds)
True
sampled_cds
min_coord | max_coord | ori | pseudo | protein_id | product | |
---|---|---|---|---|---|---|
0 | 362224 | 363492 | 1 | NaN | NaN | Predicted DNA-binding protein |
1 | 670857 | 672062 | -1 | NaN | NaN | dfp DNA/pantothenate metabolism flavoprotein |
2 | 543976 | 544497 | 1 | NaN | NaN | Carbonic anhydrase/acetyltransferase, containi... |
3 | 422356 | 423396 | 1 | NaN | NaN | leuB-2 3-isopropylmalate dehydrogenase |
4 | 504055 | 504501 | 1 | NaN | NaN | rps13P SSU ribosomal protein S13P/S18E |
5 | 1274938 | 1275456 | -1 | NaN | NaN | RNA binding protein, containing PUA domain |
6 | 1629553 | 1630557 | 1 | NaN | NaN | purM phosphoribosylformylglycinamidine cyclo-l... |
7 | 1584032 | 1585261 | 1 | NaN | NaN | Peptidase, M50 family |
8 | 1008006 | 1008602 | 1 | NaN | NaN | phoU phosphate ABC transporter, regulatory pro... |
9 | 575710 | 576507 | 1 | NaN | NaN | Predicted exosome subunit, RNA-binding protein... |
10 | 441245 | 441991 | -1 | NaN | NaN | trpA tryptophan synthase, subunit alpha (EC 4.... |
11 | 1753260 | 1754417 | -1 | NaN | NaN | Sun/NOL1/NOP nucleolar protein |
12 | 1649057 | 1650400 | -1 | NaN | NaN | Glycosyl transferase, family 2 |
13 | 1137420 | 1138271 | 1 | NaN | NaN | dTDP 4-dehydrorhamnose reductase (dTDP-L-rhamn... |
14 | 1519091 | 1520035 | 1 | NaN | NaN | Na+/Ca2+ exchange integral membrane protein |
15 | 168296 | 168742 | 1 | NaN | NaN | moaE molybdopterin synthase, large chain |
16 | 1257727 | 1258335 | 1 | NaN | NaN | iorB indolepyruvate ferredoxin oxidoreductase,... |
17 | 1658027 | 1659223 | -1 | NaN | NaN | 2-amino-3-oxobutanoate synthase (glycine C-ace... |
18 | 631 | 1089 | -1 | NaN | NaN | nusG transcription antitermination protein |
19 | 1044682 | 1046022 | -1 | NaN | NaN | thiD hydroxymethylpyrimidine phosphate kinase |
20 | 505060 | 505473 | 1 | NaN | NaN | rps11P SSU ribosomal protein S11P |
21 | 938160 | 940778 | -1 | NaN | NaN | DEAD/DEAH box RNA helicase |
22 | 172193 | 173311 | 1 | NaN | NaN | Integral membrane protein unknown function |
23 | 1093825 | 1094694 | 1 | NaN | NaN | Heat shock protein/ Zn-dependent protease with... |
24 | 332534 | 333166 | -1 | NaN | NaN | rps3P SSU ribosomal protein S3P |
25 | 784853 | 786049 | -1 | NaN | NaN | aat alanine aminotransferase |
26 | 1374851 | 1376059 | -1 | NaN | NaN | moeA-1 molybdenum cofactor biosynthesis protein |
27 | 1391206 | 1391943 | 1 | NaN | NaN | minD-2 ATPase involved in chromosome partition... |
28 | 942333 | 943322 | 1 | NaN | NaN | Predicted permease |
29 | 58109 | 58666 | -1 | NaN | NaN | hit histidine triad protein |
30 | 995509 | 996693 | -1 | NaN | NaN | thrC threonine synthase |
31 | 925189 | 927012 | 1 | NaN | NaN | aor-2 tungsten-containing aldehyde ferredoxin ... |
32 | 132198 | 133397 | 1 | NaN | NaN | ATPase of the AAA superfamily |
33 | 871941 | 872957 | 1 | NaN | NaN | metE-like2 B12-independent methionine synthase... |
34 | 427237 | 428250 | 1 | NaN | NaN | argE acetyl ornithine deacetylase |
35 | 1311313 | 1312968 | 1 | NaN | NaN | ilvD dihydroxy-acid dehydratase (EC 4.2.1.9) |
36 | 771553 | 772650 | -1 | NaN | NaN | ftsZ-3 cell division GTPase, ftsZ homolog |
37 | 874041 | 875495 | 1 | NaN | NaN | Proline or pantothenate permease |
38 | 267930 | 269027 | -1 | NaN | NaN | pepQ-1 X-pro dipeptidase |
39 | 657848 | 660976 | -1 | NaN | NaN | moaA-like intein containing molybdenum cofacto... |
40 | 344572 | 345990 | -1 | NaN | NaN | acdA-1 acetate--coA ligase (ADP-forming) (EC 6... |
41 | 905815 | 907359 | 1 | NaN | NaN | Major extracellular endo-1,4-betaglucanase pre... |
42 | 150138 | 150851 | 1 | NaN | NaN | Biotin-(acetyl-coA carboxylase) ligase |
43 | 291349 | 292476 | -1 | NaN | NaN | RPA41 subunit of the hetero-oligomeric complex... |
44 | 1691611 | 1692174 | 1 | NaN | NaN | Nucleotide-binding protein, Maf septum formati... |
45 | 1265699 | 1266826 | -1 | NaN | NaN | fbp fructose 1,6-bisphosphatase (EC 3.1.3.11) |
46 | 853742 | 854179 | -1 | NaN | NaN | mmdC methylmalonyl-coA decarboxylase gamma chain |
47 | 1516336 | 1517460 | 1 | NaN | NaN | Methyl-accepting chemotaxis protein |
48 | 1653743 | 1656487 | -1 | NaN | NaN | alaS alanyl-tRNA synthetase |
49 | 462689 | 463402 | -1 | NaN | NaN | pur operon repressor related protein |
Retrieving 40 nt upstream sequences of the sampled CDSs from the chromosome fasta file:
length = 40
print("WORKING\n")
for i in range(len(sampled_cds)):
if sampled_cds.iloc[i]["ori"] == -1:
translation_start = sampled_cds.iloc[i]["max_coord"]
!seqret -auto -sid upstream_{i} p_abyssi.fasta[{translation_start + 1}:{translation_start + length}:r]\
stdout | cat >> upstreams.fasta
else:
translation_start = sampled_cds.iloc[i]["min_coord"]
!seqret -auto -sid upstream_{i} p_abyssi.fasta[{translation_start - length}:{translation_start - 1}]\
stdout | cat >> upstreams.fasta
if (i + 1) % 10 == 0:
print(f"{i + 1} sequences written")
print("\nDONE")
WORKING 10 sequences written 20 sequences written 30 sequences written 40 sequences written 50 sequences written DONE
Adding the reverse complement of the 3' end of the 16S rRNA (which contains the anti-SD, aSD, sequence) to the upstreams:
According to the chromosome's GenBank (gbk) file, the 16S rRNA sequence spans positions 205039..206541; however, an NCBI blastn search across Thermococci genomes has shown that this end coordinate is truncated. Twelve Thermococci 16S rRNA sequences found by blastn and annotated as complete have been saved as thermococci_16S.fasta to build an alignment with the 16S rRNA of Pyrococcus abyssi:
# Extract the annotated P. abyssi 16S rRNA region (205039..206541) and append
# it, under the ID "16S", to the FASTA file holding the other Thermococci sequences.
# NOTE(review): ">>" appends, so re-running this cell adds another "16S" record
# to thermococci_16S.fasta.
!seqret -sid 16S p_abyssi.fasta[205039:206541] stdout | cat >> thermococci_16S.fasta
Read and write (return) sequences
# Align all sequences with MAFFT (linsi) and pipe the alignment through EMBOSS
# showalign to render it as text; the result is saved to a file and printed.
# NOTE(review): "-\filter" presumably reaches showalign as "-filter" (the shell
# removes the backslash); the backslash looks like an export artefact — confirm.
!linsi --quiet thermococci_16S.fasta | showalign -\filter -show A -nosimilar -nocons > thermococci_16S_aln.txt
!cat thermococci_16S_aln.txt
10 20 30 40 50 60 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 -------------ttccggttgatcctgccggaggccactgctatgggggtccgactaag NR_121707.1 aata-----gcaattccggttgatcctgccggaggccactgctatcggggtccgactaag NR_102888.1 ------------attccggttgatcctgccggaggccactgctatcggggtccgactaag NR_102853.1 -gtactcccttaattccggttgatcctgccggaggccactgctatgggggtccgactaag NR_074867.1 -------------ttccggttgatcctgccggaggccactgctatgggggtccgactaag NR_074375.1 ------------attccggttgatcctgccggaggccactgctatgggggtccgactaag NR_074373.1 ------------attccggttgatcctgccggaggccactgctatgggggtccgactaag NR_074358.1 ------------attccggttgatcctgccggaggccactgctatgggggtccgactaag NR_042075.1 ------------------------------------------------------------ NR_028216.1 ------------attccggttgatcctgccggaggccactgctatgggggtccgactaag NR_040968.1 ------------attccggttgatcctgccggaggccactgctatgggggtccgactaag NR_040969.1 ------------attccggttgatcctgccggaggccactgctatgggggtccgactaag 16S cgtactcccttaattccggttgatcctgccggaggccactgctatgggggtccgactaag 70 80 90 100 110 120 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 ccatgcgagtcatggggcgcgctc-tgcgcg----caccggcggacggctcagtaacacg NR_121707.1 ccatgcgagtcaagggggtgtccc-tctgggacaccaccggcggacggctcagtaacacg NR_102888.1 ccatgcgagtcacgggggtgtccc-tttggggcaccaccggcggacggctcagtaacacg NR_102853.1 ccatgcgagtcaagggggcgtcccttctgggacgccaccggcggacggctcagtaacacg NR_074867.1 ccatgcgagtcatggggcgcgctc-tgcgcg----caccggcggacggctcagtaacacg NR_074375.1 ccatgcgagtcaagggggcgtcccttctgggacgccaccggcggacggctcagtaacacg NR_074373.1 ccatgcgagtcatggg--gcgcct-tgcgcg----caccggcggacggctcagtaacacg NR_074358.1 ccatgcgagtcaagggggcgtcccttctgggacgccaccggcggacggctcagtaacacg NR_042075.1 --------------------------------------ggcgggacggctcagtaacacg NR_028216.1 ccatgcgagtcatggggcgcgctc-tgcgcg----caccggcggacggctcagtaacacg NR_040968.1 ccatgcgagtcatggggcgcgctc-tgcgcg----caccggcggacggctcagtagcacg NR_040969.1 ccatgcgagtcatggg--gcgcct-tgcgcg----caccggcggacggctcagtaacacg 16S 
ccatgcgagtcaagggggcgtcccttctgggacgccaccggcggacggctcagtaacacg 130 140 150 160 170 180 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 tcggtaacctaccctcgggagggggataaccccgggaaactggggctaatcccccatagg NR_121707.1 tcggtaacctaccctcgggagggggataaccccgggaaactggggctaatcccccatagg NR_102888.1 tcggtaacctaccctcgggagggggataaccccgggaaactggggctaattccccatagg NR_102853.1 tcggtaacctaccctcgggagggggataaccccgggaaactggggctaatcccccatagg NR_074867.1 tcggtaacctaccctcgggagggggataaccccgggaaactggggctaatcccccatagg NR_074375.1 tcggtaacctaccctcgggagggggataaccccgggaaactggggctaatcccccatagg NR_074373.1 tcggtaacctaccctcgggagggggataaccccgggaaactggggctaatcccccatagg NR_074358.1 tcggtaacctaccctcgggagggggataaccccgggaaactggggctaatcccccatagg NR_042075.1 tcggtaacctaccctcgggagggggataaccccgggaaactggggctaatcccccatagg NR_028216.1 tcggtaacctaccctcgggagggggataaccccgggaaactggggctaatcccccatagg NR_040968.1 tcggtaacctaccctcgggagggggataaccccgggaaactggggctaatcccccatagg NR_040969.1 tcggtaacctaccctcgggagggggataaccccgggaaactggggctaatcccccatagg 16S tcggtaacctaccctcgggagggggataaccccgggaaactggggctaatcccccatagg 190 200 210 220 230 240 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 cctggggtactggaaggtccccaggccgaaaggggc-tc-----tgcccgcccgaggatg NR_121707.1 cctgaggtactggaaggtcctcaggccgaaaggggc-tc-----tgcccgcccgaggatg NR_102888.1 cctgaggtactggaaggtcctcaggccgaaagggac-tta----tgtccgcccgaggatg NR_102853.1 cctggggtactggaaggtccccaggccgaaagggag-ccgtaaggctccgcccgaggatg NR_074867.1 cctgaggtactggaaggtcctcaggccgaaaggggcatc-----tgcccgcccgaggatg NR_074375.1 cctggggtactggaaggtccccaggccgaaagggag-ccgtaaggctccgcccgaggatg NR_074373.1 cctgaggtactggaaggtcctcaggccgaaaggggcttc-----tgcccgcccgaggatg NR_074358.1 cctggggtactggaaggtccccaggccgaaagggag-ccgtaaggctccgcccgaggatg NR_042075.1 cctgaggtactggaaggtcctcaggccgaaaggggc-tc-----tgcccgcccgaggatg NR_028216.1 cctgaggtactggaaggtcctcaggccgaaaggggcatc-----tgcccgcccgaggatg NR_040968.1 cctgaggtactggaaggtcctcaggccgaaaggggcatc-----tgcccgcccgaggatg NR_040969.1 
cctgaggtactggaaggtcctcaggccgaaaggggc-tt-----tgcccgcccgaggatg 16S cctggggtactggaaggtccccaggccgaaagggag-ccgtaaggctccgcccgaggatg 250 260 270 280 290 300 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 ggccggcggccgattaggtagttggtggggtaacggcccaccaagccgaagatcggtacg NR_121707.1 ggccggcggccgattaggtagttggtggggtaacggcccaccaagccgaagatcggtacg NR_102888.1 ggccggcggccgattaggtagttggtggggtaatggcccaccaagccgaagatcggtacg NR_102853.1 ggccggcggccgattaggtagttggtggggtaacggcccaccaagccgaagatcggtacg NR_074867.1 ggccggcggccgattaggtagttggtggggtaacggcccaccaagccgaagatcggtacg NR_074375.1 ggccggcggccgattaggtagttggtggggtaacggcccaccaagccgaagatcggtacg NR_074373.1 ggccggcggccgattaggtagttggtggggtaacggcccaccaagccgaagatcggtacg NR_074358.1 ggccggcggccgattaggtagttggtggggtaacggcccaccaagccgaagatcggtacg NR_042075.1 ggccggcggccgattaggtagttggtggggtaacggcccaccaagccgaagatcggtacg NR_028216.1 ggccggcggccgattaggtagttggtggggtaacggcccaccaagccgaagatcggtacg NR_040968.1 ggccggcggccgattaggtagttggtggggtaacggcccaccaagccgaagatcggtacg NR_040969.1 ggccggcggccgattaggtagttggtggggtaacggcccaccaagccgaagatcggtacg 16S ggccggcggccgattaggtagttggtggggtaacggcccaccaagccgaagatcggtacg 310 320 330 340 350 360 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 ggccatgagagtgggagcccggagatggacactgagacacgggtccaggccctacggggc NR_121707.1 ggctgtgagagcaggagcccggagatggacactgagacacgggtccaggccctacggggc NR_102888.1 ggctgtgagagcaggagcccggagatggacactgagacacgggtccaggccctacggggc NR_102853.1 ggccgtgagagcgggagcccggagatggacactgagacacgggtccaggccctacggggc NR_074867.1 ggccatgagagtgggagcccggagatggacactgagacacgggtccaggccctacggggc NR_074375.1 ggccgtgagagcgggagcccggagatggacactgagacacgggtccaggccctacggggc NR_074373.1 ggccatgagagtgggagcccggagatggacactgagacacgggtccaggccctacggggc NR_074358.1 ggccgtgagagcgggagcccggagatggacactgagacacgggtccaggccctacggggc NR_042075.1 ggccatgagagtgggagcccggagatggacactgagacacgggtccaggccctacggggc NR_028216.1 ggccatgagagtgggagcccggagatggacactgagacacgggtccaggccctacggggc NR_040968.1 
ggccatgagagtgggagcccggagatggacactgagacacgggtccaggccctacggggc NR_040969.1 ggccatgagagtgggagcccggagatggacactgagacacgggtccaggccctacggggc 16S ggccgtgagagcgggagcccggagatggacactgagacacgggtccaggccctacggggc 370 380 390 400 410 420 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 gcagcaggcgcgaaacctccgcaatgcgggaaaccgcgacggggggacccccagtgccgt NR_121707.1 gcagcaggcgcgaaacctccgcaatgcgggaaaccgcgacggggggaccccgagtgccgt NR_102888.1 gcagcaggcgcgaaacctccgcaatgcgggaaaccgcgacggggggaccccgagtgccgt NR_102853.1 gcagcaggcgcgaaacctccgcaatgcgggaaaccgcgacggggggacccccagtgccgt NR_074867.1 gcagcaggcgcgaaacctccgcaatgcgggcaaccgcgacggggggacccccagtgccgt NR_074375.1 gcagcaggcgcgaaacctccgcaatgcgggaaaccgcgacggggggacccccagtgccgt NR_074373.1 gcagcaggcgcgaaacctccgcaatgcgggaaaccgcgacggggggacccccagtgccgt NR_074358.1 gcagcaggcgcgaaacctccgcaatgcgggaaaccgcgacggggggacccccagtgccgt NR_042075.1 gcagcaggcgcgaaacctccgcaatgcgggaaaccgcgacggggggacccccagtgccgt NR_028216.1 gcagcaggcgcgaaacctccgcaatgcgggcaaccgcgacggggggacccccagtgccgt NR_040968.1 gcagcaggcgcgaaacctccgcaatgcgggcaaccgcgacggggggacccccagtgccgt NR_040969.1 gcagcaggcgcgaaacctccgcaatgcgggcaaccgcgacggggggacccccagtgccgt 16S gcagcaggcgcgaaacctccgcaatgcgggaaaccgcgacggggggacccccagtgccgt 430 440 450 460 470 480 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 ggcacagccacggcttttccggagtgtaaaaagctccgggaataagggctgggcaaggcc NR_121707.1 ggcatcgccacggcttttccggagtgtaaaaagctccgggaataagggctgggcaaggcc NR_102888.1 ggcaacgccacggcttttccggagtgtaaaaagctccgggaataagggctgggcaaggcc NR_102853.1 gcctctggcacggcttttccggagtgtaaaaagctccgggaataagggctgggcaaggcc NR_074867.1 ggcacagccacggcttttccggagtgtaaaaagctccgggaataagggctgggcaaggcc NR_074375.1 gcctctggcacggcttttccggagtgtaaaaagctccgggaataagggctgggcaaggcc NR_074373.1 ggcatcgccacggcttttccggagtgtaaaaagctccgggaataagggctgggcaaggcc NR_074358.1 gcctctggcacggcttttccggagtgtaaaaagctccgggaataagggctgggcaaggcc NR_042075.1 ggcaacgccacggcttttccggagtgtaaaaagctccgggaataagggctgggcaaggcc NR_028216.1 
ggcatagccacggcttttccggagtgtaaaaagctccgggaataagggctgggcaaggcc NR_040968.1 ggcatcgccacggcttttccggagtgtaaaaagctccgggaataagggctgggcaaggcc NR_040969.1 ggcatcgccacggcttttccggagtgtaaagagctccgggaataagggctgggcaaggcc 16S gcctctggcacggcttttccggagtgtaaaaagctccgggaataagggctgggcaaggcc 490 500 510 520 530 540 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 ggtggcagccgccgcggtaataccggcggcccaagtggtggccgctattattgggcctaa NR_121707.1 ggtggcagccgccgcggtaataccggcggcccgagtggtggccgctattattgggcctaa NR_102888.1 ggtggcagccgccgcggtaataccggcggcccgagtggtggccgctattattgggcctaa NR_102853.1 ggtggcagccgccgcggtaataccggcggcccgagtggtggccactattattgggcctaa NR_074867.1 ggtggcagccgccgcggtaataccggcggcccaagtggtggccgctattattgggcctaa NR_074375.1 ggtggcagccgccgcggtaataccggcggcccgagtggtggccactattattgggcctaa NR_074373.1 ggtggcagccgccgcggtaataccggcggcccgagtggtggccgctattattgggcctaa NR_074358.1 ggtggcagccgccgcggtaataccggcggcccgagtggtggccactattattgggcctaa NR_042075.1 ggtggcagccgccgcggtaataccggcggcccgagtggtggccgctattattgggcctaa NR_028216.1 ggtggcagccgccgcggtaataccggcggcccgagtggtggccgctattattgggcctaa NR_040968.1 ggtggcagccgccgcggtaataccggcggcccgagtggtggccgctattattgggcctaa NR_040969.1 ggtggcagccgccgcggtaataccggcggcccgagtggtggccgctattattgggcctaa 16S ggtggcagccgccgcggtaataccggcggcccgagtggtggccactattattgggcctaa 550 560 570 580 590 600 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 agcgtccgtagccgggcccgtaagtccctggcgaaatcccacggctcaaccgtggggctt NR_121707.1 agcgtccgtagccgggcccgtaagtccctggcgaaatcccacggctcaaccgtggggctt NR_102888.1 agcgtccgtagccgggcccgtaagtccctggcgaaatcccacggctcaaccgtggggctt NR_102853.1 agcggccgtagccgggcccgtaagtccctggcgaaatcccacggctcaaccgtggggctc NR_074867.1 agcgtccgtagccgggcccgtaagtccctggcgaaatcccacggctcaaccgtggggctt NR_074375.1 agcggccgtagccgggcccgtaagtccctggcgaaatcccacggctcaaccgtggggctc NR_074373.1 agcgtccgtagccgggcccgtaagtccctggcgaaatctcacggctcaaccgtggggctt NR_074358.1 agcggccgtagccgggcccgtaagtccctggcgaaatcccacggctcaaccgtggggctc NR_042075.1 
agcgtccgtagccgggcctgtaagtccctggcgaaatcccacggctcaaccgtggggctt NR_028216.1 agcgtccgtagccgggcccgtaagtccctggcgaaatcccacggctcaaccgtggggctt NR_040968.1 agcgtccgtagccgggcccgtaagtccctggcgaaatcccacggctcaaccgtggggctt NR_040969.1 agcgtccgtagccgggcccgtaagtccctggcgaaatcccacggctcaaccgtggggctt 16S agcggccgtagccgggcccgtaagtccctggcgaaatcccacggctcaaccgtggggctc 610 620 630 640 650 660 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 gctggggatactgcgggccttgggaccgggagaggccgggggtacccctggggtaggggt NR_121707.1 gctggggatactgcgggccttgggaccgggagaggcggggggtacccctggggtaggggt NR_102888.1 gctggggatactgcgggtcttgggaccgggagaggcggggggtacccctggggtaggggt NR_102853.1 gctggggatactgcgggccttgggaccgggagaggccgggggtacccccggggtaggggt NR_074867.1 gctggggatactgcgggccttgggaccgggagaggccgggggtacccctggggtaggggt NR_074375.1 gctggggatactgcgggccttgggaccgggagaggccgggggtacccccggggtaggggt NR_074373.1 gctggggatactgcgggccttgggaccgggagaggccgggggtacccctggggtaggggt NR_074358.1 gctggggatactgcgggccttgggaccgggagaggccgggggtacccccggggtaggggt NR_042075.1 gctggggatactgcaggccttgggaccgggagaggccgggggtactcctggggtaggggt NR_028216.1 gctggggatactgcgggccttgggaccgggagaggccgggggtacccctggggtaggggt NR_040968.1 gctggggatactgcgggccttgggaccgggagaggccgggggtacccctggggtaggggt NR_040969.1 gctggggatactgcgggccttgggaccgggagaggcggagggtactcctggggtaggggt 16S gctggggatactgcgggccttgggaccgggagaggccgggggtacccccggggtaggggt 670 680 690 700 710 720 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 gaaatcctataatcccagggggaccgccagtggcgaaggcgcccggctggaacgggtccg NR_121707.1 gaaatcctataatcccagggggaccgccagtggcgaaggcgccccgctggaacgggtccg NR_102888.1 gaaatcctataatcccagggggaccgccagtggcgaaggcgccccgctggaacgggtccg NR_102853.1 gaaatcctataatcccggggggaccgccagtggcgaaggcgcccggctggaacgggtccg NR_074867.1 gaaatcctgtaatcccagggggaccgccagtggcgaaggcgcccggctggaacgggtccg NR_074375.1 gaaatcctataatcccggggggaccgccagtggcgaaggcgcccggctggaacgggtccg NR_074373.1 gaaatcctataatcccagggggaccgccagtggcgaaggcgcccggctggaacgggtccg NR_074358.1 
gaaatcctataatcccggggggaccgccagtggcgaaggcgcccggctggaacgggtccg NR_042075.1 gaaatcctataatcccaggaggaccgccagtggcgaaggcgcccggctggaacgggtccg NR_028216.1 gaaatcctataatcccagggggaccgccagtggcgaaggcgcccggctggaacgggtccg NR_040968.1 gaaatcctataatcccagggggaccgccagtggcgaaggcgcccggctggaacgggtccg NR_040969.1 gaaatcctataatcccaggaggaccgccagtggcgaaggcgctccgctggaacgggtccg 16S gaaatcctataatcccggggggaccgccagtggcgaaggcgcccggctggaacgggtccg 730 740 750 760 770 780 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 acggtgagggacgaaggccaggggagcgaaccggattagatacccgggtagtcctggctg NR_121707.1 acggtgagggacgaaggccaggggagcaaaccggattagatacccgggtagtcctggctg NR_102888.1 acggtgagggacgaaggccaggggagcaaaccggattagatacccgggtagtcctggctg NR_102853.1 acggtgagggccgaaggccaggggagcgaaccggattagatacccgggtagtcctggctg NR_074867.1 acggtgagggacgaaggccaggggagcgaaccggattagatacccgggtagtcctggctg NR_074375.1 acggtgagggccgaaggccaggggagcgaaccggattagatacccgggtagtcctggctg NR_074373.1 acggtgagggacgaaggccaggggagcgaaccggattagatacccgggtagtcctggctg NR_074358.1 acggtgagggccgaaggccaggggagcgaaccggattagatacccgggtagtcctggctg NR_042075.1 acggtgagggacgaaggccaggggagcgaaccggattagatacccgggtagtcctggctg NR_028216.1 acggtgagggacgaaggccaggggagcgaaccggattagatacccgggtagtcctggctg NR_040968.1 acggtgagggacgaaggccaggggagcgaaccggattagatacccgggtagtcctggctg NR_040969.1 acggtgagggacgaaggccaggggagcgaaccggattagatacccgggtagtcctggctg 16S acggtgagggccgaaggccaggggagcgaaccggattagatacccgggtagtcctggctg 790 800 810 820 830 840 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 taaaggatgcgggctaggtgtcgggcgagcttcgagctcgcccggtgccggagggaagcc NR_121707.1 taaaggatgcgggctaggtgtcgggtgagcttcgggctcgcccggtgccgtagggaagcc NR_102888.1 taaaggatgcgggctaggtgtcgggtgagcctcgagctcgcccggtgccgtagggaagcc NR_102853.1 taaaggatgcgggctaggtgtcgggcgagcttcgagctcgcccggtgccgtagggaagcc NR_074867.1 taaaggatgcgggctaggtgtcgggcgagcttcgagctcgcccggtgccgaagggaagcc NR_074375.1 taaaggatgcgggctaggtgtcgggcgagcttcgagctcgcccggtgccgtagggaagcc NR_074373.1 
taaaggatgcgggctaggtgtcgggcgagcttcgagctcgcccggtgccgaagggaagcc NR_074358.1 taaaggatgcgggctaggtgtcgggcgagcttcgagctcgcccggtgccgtagggaagcc NR_042075.1 taaaggatgcgggctaggtgtcgggtgagctccgagctcgcccggtgccgcagggaagcc NR_028216.1 taaaggatgcgggctaggtgtcgggcgagcttcgagctcgcccggtgccggagggaagcc NR_040968.1 taaaggatgcgggctaggtgtcgggcgagcttcgagctcgcccggtgccgaagggaagcc NR_040969.1 taaaggatgcgggctaggtgtcgggcgagcttcgtgctcgctcggtgccgaagggaagcc 16S taaaggatgcgggctaggtgtcgggcgagcttcgagctcgcccggtgccgtagggaagcc 850 860 870 880 890 900 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 gttaagcccgccgcctggggagtacggccgcaaggctgaaacttaaaggaattggcgggg NR_121707.1 gttaagcccgccgcctggggagtacggccgcaaggctgaaacttaaaggaattggcgggg NR_102888.1 gttaagcccgccgcctggggagtacggccgcaaggctgaaacttaaaggaattggcgggg NR_102853.1 gttaagcccgccgcctggggagtacggccgcaaggctgaaacttaaaggaattggcgggg NR_074867.1 gttaagcccgccgcctggggagtacggccgcaaggctgaaacttaaaggaattggcgggg NR_074375.1 gttaagcccgccgcctggggagtacggccgcaaggctgaaacttaaaggaattggcgggg NR_074373.1 gttaagcccgccgcctggggagtacggccgcaaggctgaaacttaaaggaattggcgggg NR_074358.1 gttaagcccgccgcctggggagtacggccgcaaggctgaaacttaaaggaattggcgggg NR_042075.1 gttaagcccgccgcctggggagtccggccgcaaggctgaaacttaaaggaattggcgggg NR_028216.1 gttaagcccgccgcctggggagtacggccgcaaggctgaaacttaaaggaattggcgggg NR_040968.1 gttaagcccgccgcctggggagtacggccgcaaggctgaaacttaaaggaattggcgggg NR_040969.1 gttaagcccgccgcctggggagtacggccgcaaggctgaaacttaaaggaattggcgggg 16S gttaagcccgccgcctggggagtacggccgcaaggctgaaacttaaaggaattggcgggg 910 920 930 940 950 960 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 gagcactacaaggggtggagcgtgcggtttaattggattcaacgccgggaacctcaccgg NR_121707.1 gagcactacaaggggtggagcgtgcggtttaattggattcaacgccgggaacctcaccgg NR_102888.1 gagcactacaaggggtggagcgtgcggtttaattggattcaacgccgggaacctcaccgg NR_102853.1 gagcactacaaggggtggagcgtgcggtttaattggattcaacgccgggaacctcaccgg NR_074867.1 gagcactacaaggggtggagcgtgcggtttaattggattcaacgccgggaacctcaccgg NR_074375.1 
gagcactacaaggggtggagcgtgcggtttaattggattcaacgccgggaacctcaccgg NR_074373.1 gagcactacaaggggtggagcgtgcggtttaattggattcaacgccgggaacctcaccgg NR_074358.1 gagcactacaaggggtggagcgtgcggtttaattggattcaacgccgggaacctcaccgg NR_042075.1 gagcactacaaggggtggagcgtgcggtttaattggattcaacgccgggaacctcaccgg NR_028216.1 gagcactacaaggggtggagcgtgcggtttaattggattcaacgccgggaacctcaccgg NR_040968.1 gagcactacaaggggtggagcgtgcggtttaattggattcaacgccgggaacctcaccgg NR_040969.1 gagcactacaaggggtggagcgtgcggtttaattggattcaacgccgggaacctcaccgg 16S gagcactacaaggggtggagcgtgcggtttaattggattcaacgccgggaacctcaccgg 970 980 990 1000 1010 1020 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 gggcgacggcaggatgaaggccaggctgaaggtcttgccggacgcgccgagaggaggtgc NR_121707.1 gggcgacggcaggatgaaggccaggctgaaggtcttgccggacacgccgagaggaggtgc NR_102888.1 gggcgacggcaggatgaaggccaggctgaaggtcttgccggacacgccgagaggaggtgc NR_102853.1 gggcgacggcaggatgaaggccaggctgaaggtcttgccggacgcgccgagaggaggtgc NR_074867.1 gggcgacggcaggatgaaggccaggctgaaggtcttgccggacgcgccgagaggaggtgc NR_074375.1 gggcgacggcaggatgaaggccaggctgaaggtcttgccggacgcgccgagaggaggtgc NR_074373.1 gggcgacggcaggatgaaggccaggctgaaggtcttgccggacacgccgagaggaggtgc NR_074358.1 gggcgacggcaggatgaaggccaggctgaaggtcttgccggacgcgccgagaggaggtgc NR_042075.1 gggcgacggcaggatgaaggccaggctgaaggtcttgccggacacgccgagaggaggtgc NR_028216.1 gggcgacggcaggatgaaggccaggctgaaggtcttgccggacacgccgagaggaggtgc NR_040968.1 gggcgacggcaggatgaaggccaggctgaaggtcttgccggacacgccgagaggaggtgc NR_040969.1 gggcgacggcaggatgaaggccaggctgaaggtcttgccggacacgccgagaggaggtgc 16S gggcgacggcaggatgaaggccaggctgaaggtcttgccggacgcgccgagaggaggtgc 1030 1040 1050 1060 1070 1080 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 atggccgccgtcagctcgtaccgtgaggcgtccacttaagtgtggtaacgagcgagaccc NR_121707.1 atggccgccgtcagctcgtaccgtgaggcgtccacttaagtgtggtaacgagcgagaccc NR_102888.1 atggccgccgtcagctcgtaccgtgaggcgtccacttaagtgtggtaacgagcgagaccc NR_102853.1 atggccgccgtcagctcgtaccgtgaggcgtccacttaagtgtggtaacgagcgagaccc NR_074867.1 
atggccgccgtcagctcgtaccgtgaggcgtccacttaagtgtggtaacgagcgagaccc NR_074375.1 atggccgccgtcagctcgtaccgtgaggcgtccacttaagtgtggtaacgagcgagaccc NR_074373.1 atggccgccgtcagctcgtaccgtgaggcgtccacttaagtgtggtaacgagcgagaccc NR_074358.1 atggccgccgtcagctcgtaccgtgaggcgtccacttaagtgtggtaacgagcgagaccc NR_042075.1 atggccgccgtcagctcgtaccgtgaggcgtccacttaagtgtggtaacgagcgagaccc NR_028216.1 atggccgccgtcagctcgtaccgtgaggcgtccacttaagtgtggtaacgagcgagaccc NR_040968.1 atggccgccgtcagctcgtaccgtgaggcgtccacttaagtgtggtaacgagcgagaccc NR_040969.1 atggccgccgtcagctcgtaccgtgaggcgtccacttaagtgtggtaacgagcgagaccc 16S atggccgccgtcagctcgtaccgtgaggcgtccacttaagtgtggtaacgagcgagaccc 1090 1100 1110 1120 1130 1140 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 gcgcccccagttgccagtcctccccgct-ggggaggaggcactctggggggaccgccggc NR_121707.1 gcgcccccagttgccagcccttcccgct-gggaagggggcactctggggggactgccggc NR_102888.1 gtgcccccagttgccagcccttcccgtt-gggaagggggcactctggggggactgccggc NR_102853.1 gcgcccccagttgccagtccctcccgctcgggagggaggcactctggggggactgccggc NR_074867.1 gcgcccccagttgccagtcctccccgct-ggggaggaggcactctggggggaccgccggc NR_074375.1 gcgcccccagttgccagtccctcccgctcgggagggaggcactctggggggactgccggc NR_074373.1 gcgcccccagttgccagtcctccccgtt-ggggaggaggcactctggggggaccgccggc NR_074358.1 gcgcccccagttgccagtccctcccgctcgggagggaggcactctggggggactgccggc NR_042075.1 gtgcccccagttgccagtccttcccgct-gggagggaggcactctggggggaccgccggc NR_028216.1 gcgcccccagttgccagtcctccccgct-ggggaggaggcactctggggggaccgccggc NR_040968.1 gcgcccccagttgccagtcctccccgct-ggggaggaggcactctggggggaccgccggc NR_040969.1 gcgcccccagttgccagtccttcccgct-ggggaggaggcactctggggggaccgccggc 16S gcgcccccagttgccagtccctcccgctcgggagggaggcactctggggggactgccggc 1150 1160 1170 1180 1190 1200 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 gataagccggaggaaggagcgggcgacggtaggtcagtatgccccgaaacccccgggcta NR_121707.1 gataagccggaggaaggagcgggcgacggtaggtcagtatgccccgaaacccccgggcta NR_102888.1 gataagccggaggaaggagcgggcgacggtaggtcagtatgccccgaaacccccgggcta NR_102853.1 
gataagccggaggaaggggcgggcgacggtaggtcagtatgccccgaaacccccgggcta NR_074867.1 gataagccggaggaaggagcgggcgacggtaggtcagtatgccccgaaacccccgggcta NR_074375.1 gataagccggaggaaggggcgggcgacggtaggtcagtatgccccgaaacccccgggcta NR_074373.1 gataagccggaggaaggagcgggcgacggtaggtcagtatgccccgaaacccccgggcta NR_074358.1 gataagccggaggaaggggcgggcgacggtaggtcagtatgccccgaaacccccgggcta NR_042075.1 gataagccggaggaaggagcgggcgacggtaggtcagtatgccccgaaacccccgggcta NR_028216.1 gataagccggaggaaggagcgggcgacggtaggtcagtatgccccgaaacccccgggcta NR_040968.1 gataagccggaggaaggagcgggcgacggtaggtcagtatgccccgaaacccccgggcta NR_040969.1 gataagccggaggaaggagcgggcgacggtaggtcagtatgccccgaaacccccgggcta 16S gataagccggaggaaggggcgggcgacggtaggtcagtatgccccgaaacccccgggcta 1210 1220 1230 1240 1250 1260 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 cacgcgcgctacaatgggcgggacaatgggaaccgaccccgaaaggggaagggaatcccc NR_121707.1 cacgcgcgctacaatgggcgggacaatgggatccgaccccgaaaggggaagggaatcccc NR_102888.1 cacgcgcgctacaatgggtgggacaatgggatccgaccccgaaaggggaagggaatcccc NR_102853.1 cacgcgcgctacaatgggcgggacaatgggacccgaccccgaaaggggaagggaatcccc NR_074867.1 cacgcgcgctacaatgggcgggacaatgggatccgaccccgaaaggggaagggaatcccc NR_074375.1 cacgcgcgctacaatgggcgggacaatgggacccgaccccgaaaggggaagggaatcccc NR_074373.1 cacgcgcgctacaatgggcgggacaatgggatccgaccccgaaaggggaagggaatcccc NR_074358.1 cacgcgcgctacaatgggcgggacaatgggtgccgaccccgaaagggggaggtaatcccc NR_042075.1 cacgcgcgctacaatgagcgggacaatgggatccgaccccgaaaggggaagggaatcccc NR_028216.1 cacgcgcgctacaatgggcgggacaatgggatccgaccccgaaaggggaagggaatcccc NR_040968.1 cacgcgcgctacaatgggcgggacaatgggatccgaccccgaaaggggaagggaatcccc NR_040969.1 cacgcgcgctacaatgagcgggacaatgggatccgaccccgaaaggggaaggtaatcccc 16S cacgcgcgctacaatgggcgggacaatgggtgccgaccccgaaagggggaggtaatcccc 1270 1280 1290 1300 1310 1320 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 taaacccgccctcagttcggatcgcgggctgcaactcgcccgcgtgaagctggaatccct NR_121707.1 taaacccgcccccagttcggatcgcgggctgcaactcgcccgcgtgaagctggaatccct NR_102888.1 
taaacccgcccccagttcggatcgcgggctgcaactcgcccgcgtgaagctggaatccct NR_102853.1 taaacccgccctcagttcggatcgcgggctgcaactcgcccgcgtgaagctggaatccct NR_074867.1 taaacccgccctcagttcggatcgcgggctgcaactcgcccgcgtgaagctggaatccct NR_074375.1 taaacccgccctcagttcggatcgcgggctgcaactcgcccgcgtgaagctggaatccct NR_074373.1 taaacccgccctcagttcggatcgcgggctgcaactcgcccgcgtgaagctggaatccct NR_074358.1 taaacccgccctcagttcggatcgcgggctgcaactcgcccgcgtgaagctggaatccct NR_042075.1 taaacccgctcccagttcggatcgcgggctgcaactcgcccgcgtgaagctggaatccct NR_028216.1 taaacccgccctcagttcggatcgcgggctgcaactcgcccgcgtgaagctggaatccct NR_040968.1 taaacccgcccccagttcggatcgcgggctgcaactcgcccgcgtgaagctggaatccct NR_040969.1 taaacccgctcccagttcggatcgcgggctgcaactcgcccgcgtgaagctggaatccct 16S taaacccgccctcagttcggatcgcgggctgcaactcgcccgcgtgaagctggaatccct 1330 1340 1350 1360 1370 1380 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 agtacccgcgtgtcatcatcgcgcggcgaatacgtccctgctccttgcacacaccgcccg NR_121707.1 agtacccgcgtgtcatcatcgcgcggcgaatacgtccctgctccttgcacacaccgcccg NR_102888.1 agtacccgcgtgtcatcatcgcgcggcgaatacgtccctgctccttgcacacaccgcccg NR_102853.1 agtacccgcgcgtcatcatcgcgcggcgaatacgtccctgctccttgcacacaccgcccg NR_074867.1 agtacccgcgtgtcatcatcgcgcggcgaatacgtccctgctccttgcacacaccgcccg NR_074375.1 agtacccgcgtgtcatcatcgcgcggcgaatacgtccctgctccttgcacacaccgcccg NR_074373.1 agtacccgcgtgtcatcatcgcgcggcgaatacgtccctgctccttgcacacaccgcccg NR_074358.1 agtacccgcgcgtcatcatcgcgcggcgaatacgtccctgctccttgcacacaccgcccg NR_042075.1 agtacccgcgtgtcatcatcgcgcggcgaatacgtccctgctccttgcacacaccgcccg NR_028216.1 agtacccgcgtgtcatcatcgcgcggcgaatacgtccctgctccttgcacacaccgcccg NR_040968.1 agtacccgcgtgtcatcatcgcgcggcgaatacgtccctgctccttgcacacaccgcccg NR_040969.1 agtacccgcgtgtcatcatcgcgcggcgaatacgtccctgctccttgcacacaccgcccg 16S agtacccgcgcgtcatcatcgcgcggcgaatacgtccctgctccttgcacacaccgcccg 1390 1400 1410 1420 1430 1440 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 tcactccacccgagcggggtccgggtgaggcctggtctcccttcggggaggccgggtcga NR_121707.1 
tcactccacccgagcggggtctggatgaggctccgtcctct-------gggcggggtcga NR_102888.1 tcactccacccgagcggggtctggatgaggctccgtcctct-------gggcagagtcga NR_102853.1 tcactccacccgagcggggtccgggtgaggcccagtctccttc--gggaggctgggtcga NR_074867.1 tcactccacccgagcggggtccgggtgaggcctggtctcccttcggggaggccgggtcga NR_074375.1 tcactccacccgagcggagtccgggtgaggcctgatctccttc--gggaggtcaggtcga NR_074373.1 tcactccacccgagcggggtccgggtgaggcctgatctcccctcggggaggtcgggtcga NR_074358.1 tcactccacccgagcggggcctaggtgaggcccgatctccttc--gggaggtcgggtcga NR_042075.1 tcactccacccgagcggggtctggatgaggctcgatctcccttcggggaggccgggtcga NR_028216.1 tcactccacccgagcggggtccgggtgaggcctggtctcccttcggggaggccgggtcga NR_040968.1 tcactccacccgagcggggtctggatgaggcctgatctcccttcggggaggtcgggtcga NR_040969.1 tcactccacccgagcggggtctggatgaggcctgatctcccttcggggaggccgggtcga 16S tcactccacccgagcggggcctaggtgaggcccgatctccttc--gggaggtcgggtcga 1450 1460 1470 1480 1490 1500 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 gcctgggctccgtgaggggggagaagtcgtaacaaggtagccgtaggggaacctacggct NR_121707.1 gtccaggctccgtgaggggggagaagtcgtaacaaggtagccgtaggggaacctacggct NR_102888.1 gtccaggctccgtgaggggggagaagtcgtaacaaggtagccgtaggggaacctacggct NR_102853.1 gcccgggctccgtgaggggggagaagtcgtaacaaggtagccgtaggggaacctacggct NR_074867.1 gcctgggctccgtgaggggggagaagtcgtaacaaggtagccgtaggggaacctacggct NR_074375.1 gcccgggctccgtgaggggggagaagtcgtaacaaggtagccgtaggggaacctacggct NR_074373.1 gcctgggctccgtgaggggggagaagtcgtaacaaggtagccgtaggggaacctacggct NR_074358.1 gcctaggctccgtgaggggggagaagtcgtaacaaggtagccgtaggggaacctacggct NR_042075.1 gtccgggctccgtgaggggggagaagtcgtaacaaggtagccgtaggggaacctacggct NR_028216.1 gcctgggctccgtgaggggggagaagtcgtaacaaggtagccgtaggggaacctacggct NR_040968.1 gtctgggctccgtgaggggggagaagtcgtaacaaggtagccgtaggggaacctacggct NR_040969.1 gtctgggctccgtgaggggggagaagtcgtaacaaggtagccgtaggggaacctacggct 16S gcctaggctccgtgaggggggagaagtcgtaacaaggtagccgtaggggaacctacggct 1510 1520 1530 1540 1550 1560 ----:----|----:----|----:----|----:----|----:----|----:----| NR_121795.1 
cgatcacctcctatcgccggaaacccgtccgggggggtttaaggggtgccgggcctgcca NR_121707.1 cgatcacctcctatcgccgga--------------------------------------- NR_102888.1 cgatcacctcct------------------------------------------------ NR_102853.1 cgatcacctcctatcgccggaa-------------------------------------- NR_074867.1 cgatcacctcctatcgccggaaacccgtcc-ggggggtttaagggatgtcgggcctgcct NR_074375.1 cgatcacctcct------------------------------------------------ NR_074373.1 cgatcacctcct------------------------------------------------ NR_074358.1 cgatcacctc-------------------------------------------------- NR_042075.1 cgatcacctcctatcgccgga--------------------------------------- NR_028216.1 cgatcacctcctatcgccgga--------------------------------------- NR_040968.1 cgatcacctcct------------------------------------------------ NR_040969.1 cgatcacctcct------------------------------------------------ 16S cgatca------------------------------------------------------ ----: NR_121795.1 acgtg NR_121707.1 ----- NR_102888.1 ----- NR_102853.1 ----- NR_074867.1 tca-g NR_074375.1 ----- NR_074373.1 ----- NR_074358.1 ----- NR_042075.1 ----- NR_028216.1 ----- NR_040968.1 ----- NR_040969.1 ----- 16S -----
So, based on the alignment above, coordinate 206547 has been chosen as the end:
# Extract the 40-nt range ending at coordinate 206547 (206508..206547) in
# reverse orientation (":r") and append it (">>") to upstreams.fasta.
# NOTE(review): "-sid 16S_rev_comp" presumably sets the record ID - confirm against EMBOSS docs.
!seqret -sid 16S_rev_comp p_abyssi.fasta[{206547 - 39}:206547:r] stdout | cat >> upstreams.fasta
Read and write (return) sequences
upstreams.fasta file, comprising 40 nt upstreams of 50 protein coding genes and 40 nt reverse-complemented 16S rRNA gene of P. abyssi, is now passed to the MEME program for motif finding:
To find the short motif MEME is run with the options: classic mode, zoops, 1 motif to find, 0-order background model, motif strictly 5 nt long, search given strand only
To find the long motif MEME is run with the options: classic mode, zoops, 1 motif to find, 0-order background model, motif 5-10 nt long, search given strand only
Randomly sampling 500 CDSs from the 972 good CDSs and, provided that SD motif sites are expected to be found somewhere within 20 nt upstream to start-codons, taking their 20 nt upstream sequences:
# Reproducible draw of 500 distinct CDSs; the generator is re-seeded from the
# system source right afterwards so later random calls are not affected.
random.seed(777)
picked_rows = random.sample(range(len(good_cds)), k=500)
small_dataset = good_cds.iloc[picked_rows].reset_index(drop=True)
random.seed()

l = 20  # length of the upstream window, nt

# Replace the CDS coordinates with those of its upstream window:
#   ori ==  1 -> [min_coord - l, min_coord - 1]
#   otherwise -> [max_coord + 1, max_coord + l]
on_forward_strand = small_dataset["ori"] == 1
small_dataset["min_coord"] = np.where(on_forward_strand,
                                      small_dataset["min_coord"] - l,
                                      small_dataset["max_coord"] + 1)
# max_coord is derived from the already shifted min_coord.
small_dataset["max_coord"] = small_dataset["min_coord"] + l - 1

small_dataset = small_dataset.drop(columns=["pseudo", "protein_id"])
small_dataset.to_csv("small_dataset.xls", sep="\t", index=False)
Making a fasta file of the selected upstreams:
!python3 /home/flaumberg/Programs/my_scripts/fragments2fasta.py -i small_dataset.xls -f p_abyssi.fasta\
-o 500_good_cds_20.fasta
wait.. fragment coordinates from small_dataset.xls fragment sequences from p_abyssi.fasta writing fragments to 500_good_cds_20.fasta 5 fragments written 10 fragments written 50 fragments written 100 fragments written 150 fragments written 200 fragments written 300 fragments written 400 fragments written 500 fragments written 500 fragments have been written ..done
500_good_cds_20.fasta, comprising the 500 upstream 20 nt long sequences selected, is now passed to the FIMO program for motif searching with the help of the built PWMs:
To search for the short (more strict, 5 nt long) motif FIMO is run with the options: PWM SD_short.txt, scan given strand only, p-value threshold 0.001 (and then 0.01)
To search for the long (less strict, 9 nt long) motif FIMO is run with the options: PWM SD_long.txt, scan given strand only, p-value threshold 0.001 (and then 0.01)
Since with the p-value threshold of 0.001 both PWMs yielded far fewer motifs than expected from previous studies, only the sets of motifs found with the threshold of 0.01 (fimo_good_20_short_01.tsv and fimo_good_20_long_01.tsv) are considered in the following analysis
FDR has been chosen to be 0.1, so all the findings with q-value above 0.1 are being dismissed:
# Load the FIMO hits obtained with the short PWM and keep only the hits
# whose q-value does not exceed the chosen FDR of 0.1.
fimo_short_hits = pd.read_csv("fimo_good_20_short_01.tsv", sep="\t", index_col=False)
short_passes_fdr = fimo_short_hits["q-value"] <= 0.1
motifs_good_short = fimo_short_hits[short_passes_fdr]
motifs_good_short.shape
(450, 10)
motifs_good_short["sequence_name"].nunique()
344
There are some sequences with more than 1 motif found, for these CDS upstreams a motif with the best p-value is being chosen:
# Keep one hit per upstream sequence: the one with the smallest p-value.
# groupby(...).idxmin() returns index *labels*; the frame was filtered by
# q-value without resetting its index, so labels are not guaranteed to equal
# row positions. Select by label (.loc), not by position (.iloc).
best_short_labels = motifs_good_short.groupby("sequence_name")["p-value"].idxmin()
motifs_good_short = motifs_good_short.loc[best_short_labels, :]
motifs_good_short["matched_sequence"].value_counts()
GGTGA 131 GGAGG 79 GGTGG 50 GGGGA 29 GGAGA 24 GGTGT 20 GGTGC 7 GGCGA 4 Name: matched_sequence, dtype: int64
Most of the sites found seem reliable, except for the 11 that contain a C nt in their sequences.
A histogram of distances to the start codon is shown below:
# Histogram of where the 3rd nt of each short-motif hit lies relative to the
# start codon: `start` + 2 is the fragment coordinate of that nt and 20 is
# the length of the upstream window.
short_positions = -(20 - motifs_good_short["start"] - 2)
short_ticks = np.linspace(-9, 0, 10) * 2  # -18, -16, ..., 0

plt.figure(figsize=[9, 7])
sns.histplot(x=short_positions, discrete=True)
plt.grid(True, axis="y")
plt.xticks(short_ticks, size=12)
plt.yticks(size=12)
plt.xlabel("position", size=14)
plt.ylabel("count", size=14)
plt.title("Position of the motif GGTGA 3rd nt relative to start-codon", size=16)
plt.savefig("motifs_good_short_distance.png", dpi=300)
Extracting the sequences of the motifs found and filtered to build the motif LOGO:
# One matched site per line; this file is the input for the motif LOGO.
with open("motifs_good_short.txt", "w") as logo_input:
    logo_input.writelines(f"{site}\n" for site in motifs_good_short["matched_sequence"])
The LOGO built with no adjustment for composition:
%%html
<img src="motifs_good_short.svg" style="height:300px">
Making a table for protein coding genes with the SD sites found:
# Build a per-CDS table of the SD sites found with the short PWM.
#
# Each FIMO sequence name is mapped back to its row in small_dataset by
# comparing "<min_coord>_<max_coord>_<ori>" with the name minus its first
# 15 characters (presumably a fixed prefix added by fragments2fasta.py -
# TODO confirm). The orientation suffix is taken from the name itself.
# The coordinate part of the key does not depend on the hit, so it is built once.
short_coord_key = (small_dataset["min_coord"].apply(str) + "_" +
                   small_dataset["max_coord"].apply(str) + "_")
SD_good_short = motifs_good_short["sequence_name"].apply(
    lambda name: small_dataset.iloc[
        np.where((short_coord_key +
                  ("-1" if name[-2:] == "-1" else "1")) == name[15:])[0][0],
        [0, 1, 2, -1]])
# Both assignments align on the index shared with motifs_good_short.
SD_good_short["SD_sequence"] = motifs_good_short["matched_sequence"]
# 20 is the upstream window length; the result is the gap between the motif
# end and the start codon.
SD_good_short["SD_to_start_distance"] = 20 - motifs_good_short["stop"]
# reset_index returns a new frame; the result has to be assigned to take effect.
SD_good_short = SD_good_short.reset_index(drop=True)
SD_good_short.to_csv("SD_good_short.tsv", sep="\t", index=False)
FDR has been chosen to be 0.1, so all the findings with q-value above 0.1 are being dismissed:
# Load the FIMO hits obtained with the long PWM and keep only the hits
# whose q-value does not exceed the chosen FDR of 0.1.
fimo_long_hits = pd.read_csv("fimo_good_20_long_01.tsv", sep="\t", index_col=False)
long_passes_fdr = fimo_long_hits["q-value"] <= 0.1
motifs_good_long = fimo_long_hits[long_passes_fdr]
motifs_good_long.shape
(555, 10)
motifs_good_long["sequence_name"].nunique()
329
There are some sequences with more than 1 motif found, for these CDS upstreams a motif with the best p-value is being chosen:
# Keep one hit per upstream sequence: the one with the smallest p-value.
# groupby(...).idxmin() returns index *labels*; the frame was filtered by
# q-value without resetting its index, so labels are not guaranteed to equal
# row positions. Select by label (.loc), not by position (.iloc).
best_long_labels = motifs_good_long.groupby("sequence_name")["p-value"].idxmin()
motifs_good_long = motifs_good_long.loc[best_long_labels, :]
motifs_good_long["matched_sequence"].value_counts()
GGAGGTGAT 8 TGAGGTGAT 7 GGGGGTGAT 6 GGAGGTGAA 6 TGAGGTGGT 5 .. GGAGGCGAG 1 CATGGTGAT 1 AGAGGGGAA 1 GGAGGGGCG 1 AAAGGTGGA 1 Name: matched_sequence, Length: 232, dtype: int64
As the motif, that has been searched for, is 9 nt long, there are plenty of matching variants.
To compare the performance of the long PWM with that of the short one, the frequencies of the most expected variants of the SD conserved region, as well as the frequency of a quite unexpected C nt in this region, are estimated:
# The 5 nt preceding the last nt of every matched site are compared with the
# variants found by the short PWM.
long_core = motifs_good_long["matched_sequence"].apply(lambda site: site[-6:-1])
expected_cores = ("GGTGA", "GGAGG", "GGTGG", "GGGGA")

n_ggtga = long_core.apply(lambda core: "GGTGA" in core).sum()
n_expected = long_core.apply(
    lambda core: any(variant in core for variant in expected_cores)).sum()
n_with_c = long_core.apply(lambda core: "C" in core).sum()

print(f'contain GGTGA\t\t\t\t{n_ggtga}')
print(f'contain GGTGA/GGAGG/GGTGG/GGGGA\t\t{n_expected}')
print(f'contain C\t\t\t\t{n_with_c}')
contain GGTGA 105 contain GGTGA/GGAGG/GGTGG/GGGGA 174 contain C 43
A histogram of distances to the start-codon is shown below:
# Histogram of where the 6th nt of each long-motif hit lies relative to the
# start codon: `start` + 5 is the fragment coordinate of that nt and 20 is
# the length of the upstream window.
long_positions = -(20 - motifs_good_long["start"] - 5)
long_ticks = np.linspace(-7, 0, 8) * 2  # -14, -12, ..., 0

plt.figure(figsize=[9, 7])
sns.histplot(x=long_positions, discrete=True)
plt.grid(True, axis="y")
plt.xticks(long_ticks, size=12)
plt.yticks(size=12)
plt.xlabel("position", size=14)
plt.ylabel("count", size=14)
plt.title("Position of the motif GGAGGTGAK 6th nt relative to start-codon", size=16)
plt.savefig("motifs_good_long_distance.png", dpi=300)
Extracting the sequences of the motifs found and filtered to build the motif LOGO:
# One matched site per line; this file is the input for the motif LOGO.
with open("motifs_good_long.txt", "w") as logo_input:
    logo_input.writelines(f"{site}\n" for site in motifs_good_long["matched_sequence"])
The LOGO is built with no adjustment for composition:
%%html
<!-- Displays motifs_good_long.svg at a fixed height of 300px; the SVG is not created in this notebook. -->
<img src="motifs_good_long.svg" style="height:300px">
Making a table for protein coding genes with the SD sites found:
# Build a per-CDS table of the SD sites found with the long PWM.
#
# Each FIMO sequence name is mapped back to its row in small_dataset by
# comparing "<min_coord>_<max_coord>_<ori>" with the name minus its first
# 15 characters (presumably a fixed prefix added by fragments2fasta.py -
# TODO confirm). The orientation suffix is taken from the name itself.
# The coordinate part of the key does not depend on the hit, so it is built once.
long_coord_key = (small_dataset["min_coord"].apply(str) + "_" +
                  small_dataset["max_coord"].apply(str) + "_")
SD_good_long = motifs_good_long["sequence_name"].apply(
    lambda name: small_dataset.iloc[
        np.where((long_coord_key +
                  ("-1" if name[-2:] == "-1" else "1")) == name[15:])[0][0],
        [0, 1, 2, -1]])
# Both assignments align on the index shared with motifs_good_long.
SD_good_long["SD_sequence"] = motifs_good_long["matched_sequence"]
# 20 is the upstream window length; the result is the gap between the motif
# end and the start codon.
SD_good_long["SD_to_start_distance"] = 20 - motifs_good_long["stop"]
# reset_index returns a new frame; the result has to be assigned to take effect.
SD_good_long = SD_good_long.reset_index(drop=True)
SD_good_long.to_csv("SD_good_long.tsv", sep="\t", index=False)
Using the CDS table, obtained earlier with the features2CDSs.py script:
# Start from the full CDS table and turn every CDS into a window covering
# l_up nt before and l_down nt from the annotated start codon onwards.
large_dataset = pd.read_csv("p_abyssi.xls", sep="\t", index_col=False)

l_up = 20    # nt taken upstream of the start codon
l_down = 15  # nt taken from the start codon onwards

#   ori ==  1 -> window starts at min_coord - l_up
#   otherwise -> window starts at max_coord - l_down + 1
large_on_forward = large_dataset["ori"] == 1
large_dataset["min_coord"] = np.where(large_on_forward,
                                      large_dataset["min_coord"] - l_up,
                                      large_dataset["max_coord"] - l_down + 1)
# max_coord is derived from the already shifted min_coord.
large_dataset["max_coord"] = large_dataset["min_coord"] + l_up + l_down - 1

large_dataset = large_dataset.drop(columns=["pseudo", "protein_id"])
large_dataset.to_csv("large_dataset.xls", sep="\t", index=False)
Making a fasta file of all the upstreams:
!python3 /home/flaumberg/Programs/my_scripts/fragments2fasta.py -i large_dataset.xls -f p_abyssi.fasta\
-o all_cds_20_15.fasta
wait.. fragment coordinates from large_dataset.xls fragment sequences from p_abyssi.fasta writing fragments to all_cds_20_15.fasta 5 fragments written 10 fragments written 50 fragments written 100 fragments written 150 fragments written 200 fragments written 300 fragments written 400 fragments written 500 fragments written 600 fragments written 700 fragments written 800 fragments written 900 fragments written 1000 fragments written 1100 fragments written 1200 fragments written 1300 fragments written 1400 fragments written 1500 fragments written 1600 fragments written 1700 fragments written 1783 fragments have been written ..done
all_cds_20_15.fasta file, comprising 20 nt upstream - 15 nt downstream sequences of all the 1783 CDSs, is now passed to the FIMO program for motif searching with the help of the short (the more strict) PWM:
FIMO is run with the options: PWM SD_short.txt, scan given strand only, p-value threshold 0.001 (and then 0.01)
Since with 0.001 as the p-value cutoff only 587 sites were found, all of them having the same sequence, only the set of sites found with the p-value cutoff of 0.01 (fimo_all_20_15_01.tsv) is considered in the following analysis
FDR has been chosen to be 0.1, so all the findings with q-value above 0.1 are being dismissed:
# Load the FIMO hits for all CDS windows (short PWM) and keep only the hits
# whose q-value does not exceed the chosen FDR of 0.1.
fimo_all_hits = pd.read_csv("fimo_all_20_15_01.tsv", sep="\t", index_col=False)
all_passes_fdr = fimo_all_hits["q-value"] <= 0.1
motifs_all_short = fimo_all_hits[all_passes_fdr]
motifs_all_short.shape
(1602, 10)
motifs_all_short["sequence_name"].nunique()
1161
There are some sequences with more than 1 motif found, for these CDS upstreams a motif with the best p-value is being chosen:
# Keep one hit per sequence: the one with the smallest p-value.
# groupby(...).idxmin() returns index *labels*; the frame was filtered by
# q-value without resetting its index, so labels are not guaranteed to equal
# row positions. Select by label (.loc), not by position (.iloc).
best_all_labels = motifs_all_short.groupby("sequence_name")["p-value"].idxmin()
motifs_all_short = motifs_all_short.loc[best_all_labels, :]
motifs_all_short["matched_sequence"].value_counts()
GGTGA 560 GGAGG 247 GGTGG 240 GGAGA 114 Name: matched_sequence, dtype: int64
All the sites found seem to be reliable according to their sequences.
A histogram of the distances to annotated start-codon is shown below:
# Histogram of where the 3rd nt of each motif hit lies relative to the
# annotated start codon: `start` + 2 is the fragment coordinate of that nt
# and 20 is the number of upstream nt in every fragment.
plt.figure(figsize=[9, 7])
sns.histplot(x=-(20 - motifs_all_short["start"] - 2), discrete=True)
plt.grid(True, axis="y")
plt.xticks(np.linspace(-9, 7, 17) * 2, size=12)  # -18, -16, ..., 14
plt.yticks(size=12)
plt.ylim(0, 200)
plt.xlabel("position", size=14)
plt.ylabel("count", size=14)
# Ordinal fixed ("3th" -> "3rd") to match the title of the small-dataset plot.
plt.title("Position of the motif GGTGA 3rd nt relative to annotated start-codon", size=16)
# Label x = 1, 2, 3 with the letters of the start codon.
plt.text(1, 0, "A", size=20, ha="center", color="red")
plt.text(2, 0, "T", size=20, ha="center", color="red")
plt.text(3, 0, "G", size=20, ha="center", color="red")
plt.savefig("motifs_all_short_distance.png", dpi=300)
It can be seen that a significant number of SD sites are found downstream of the annotated start-codon (marked with "ATG" in the histogram). Some of these findings might truly be SD sites, implying wrong start-codon annotation, while the others are probably artifacts of the multiple-testing problem (FDR = 0.1 allows as many as 10% of the sites to be false positives)
Making a table for all the protein coding genes of P. abyssi GE5 chromosome with the SD sites found:
# Build a per-CDS table of the SD sites found in all CDS windows.
#
# Each FIMO sequence name is mapped back to its row in large_dataset by
# comparing "<min_coord>_<max_coord>_<ori>" with the name minus its first
# 15 characters (presumably a fixed prefix added by fragments2fasta.py -
# TODO confirm). The orientation suffix is taken from the name itself.
# The coordinate part of the key does not depend on the hit, so it is built once.
all_coord_key = (large_dataset["min_coord"].apply(str) + "_" +
                 large_dataset["max_coord"].apply(str) + "_")
SD_all_short = motifs_all_short["sequence_name"].apply(
    lambda name: large_dataset.iloc[
        np.where((all_coord_key +
                  ("-1" if name[-2:] == "-1" else "1")) == name[15:])[0][0],
        [0, 1, 2, -1]])
# Both assignments align on the index shared with motifs_all_short.
SD_all_short["SD_sequence"] = motifs_all_short["matched_sequence"]
# 20 is the number of upstream nt in every fragment, so the value is negative
# when the motif ends past the annotated start (stop > 20).
SD_all_short["SD_to_start_distance"] = 20 - motifs_all_short["stop"]
# reset_index returns a new frame; the result has to be assigned to take effect.
SD_all_short = SD_all_short.reset_index(drop=True)
SD_all_short.to_csv("SD_all_short.tsv", sep="\t", index=False)
A check of the distribution of unique site sequences among the sites found downstream of the annotated start-codon has not shown anything special, so there is no strong reason to consider those sites false positives:
# Hits whose stop coordinate lies beyond the 20 upstream nt of the fragment.
ends_after_upstream = motifs_all_short["stop"] > 20
downstream_hits = motifs_all_short[ends_after_upstream]
downstream_hits["matched_sequence"].value_counts()
GGTGA 130 GGTGG 57 GGAGA 51 GGAGG 34 Name: matched_sequence, dtype: int64
It is interesting whether there is a greater share of poorly annotated genes among those with SD found downstream to annotated start-codon. Hence applying the chi2-test for independence of variables:
import scipy.stats as sps

# Rows: SD site ends past the annotated start (negative distance).
sd_past_start = SD_all_short["SD_to_start_distance"] < 0
# Columns: product name is missing or contains "hypothetical"
# (a "putative" criterion is not applied here).
poorly_named = SD_all_short["product"].apply(
    lambda product: pd.isna(product) or "hypothetical" in product.lower())

obs = pd.crosstab(sd_past_start, poorly_named,
                  rownames=["wrong start"], colnames=["bad name"])
obs
bad name | False | True |
---|---|---|
wrong start | ||
False | 632 | 257 |
True | 183 | 89 |
# chi2_contingency result is indexed as (statistic, p-value, dof, ...).
print("Contingency table chi2-test for independence of variables:\n")
chi2_result = sps.chi2_contingency(obs)
print(f"chi2-statistic = {chi2_result[0]}, dof = {chi2_result[2]}, p-value = {chi2_result[1]}")
Contingency table chi2-test for independence of variables: chi2-statistic = 1.2699990670400303, dof = 1, p-value = 0.25976685492204077
The test has shown no significant dependency between quality of gene name annotation and the position of SD site, which undermines the assumption of wrong start-codon annotation
Extracting the sequences of the motifs found and filtered to build the motif LOGO:
# One matched site per line; this file is the input for the motif LOGO.
with open("motifs_all_short.txt", "w") as logo_input:
    logo_input.writelines(f"{site}\n" for site in motifs_all_short["matched_sequence"])
The LOGO is built with no adjustment for composition:
%%html
<!-- Displays motifs_all_short.svg at a fixed height of 300px; the SVG is not created in this notebook. -->
<img src="motifs_all_short.svg" style="height:300px">
Analysis complete