import gzip
#Haloferax volcanii = hv
#Halobacterium salinarum = hs
with gzip.open('UP000000554.swiss.gz', 'rt') as hs:
entries_hs = ''.join(hs.readlines()).split('\nID')
ec1_hs = 0
ec2_hs = 0
ec3_hs = 0
ec4_hs = 0
ec5_hs = 0
ec6_hs = 0
ec7_hs = 0
for entry in entries_hs:
if entry.find('Plasmid pNRC200') != -1:
if entry.find('EC=1.') != -1:
ec1_hs += 1
if entry.find('EC=2.') != -1:
ec2_hs += 1
if entry.find('EC=3.') != -1:
ec3_hs += 1
if entry.find('EC=4.') != -1:
ec4_hs += 1
if entry.find('EC=5.') != -1:
ec5_hs += 1
if entry.find('EC=6.') != -1:
ec6_hs += 1
if entry.find('EC=7.') != -1:
ec7_hs += 1
with gzip.open('UP000008243.swiss.gz', 'rt') as hv:
entries_hv = ''.join(hv.readlines()).split('\nID')
ec1_hv = 0
ec2_hv = 0
ec3_hv = 0
ec4_hv = 0
ec5_hv = 0
ec6_hv = 0
ec7_hv = 0
for entry in entries_hv:
if entry.find('Plasmid pHV4') != -1:
if entry.find('EC=1.') != -1:
ec1_hv += 1
if entry.find('EC=2.') != -1:
ec2_hv += 1
if entry.find('EC=3.') != -1:
ec3_hv += 1
if entry.find('EC=4.') != -1:
ec4_hv += 1
if entry.find('EC=5.') != -1:
ec5_hv += 1
if entry.find('EC=6.') != -1:
ec6_hv += 1
if entry.find('EC=7.') != -1:
ec7_hv += 1
print(f'oxidoreductases in reference proteome: {ec1_hs}, in H.volcanii: {ec1_hv}')
print(f'transferases in reference proteome: {ec2_hs}, in H.volcanii: {ec2_hv}')
print(f'hydrolases in reference proteome: {ec3_hs}, in H.volcanii: {ec3_hv}')
print(f'hydrolases in reference proteome: {ec4_hs}, in H.volcanii: {ec4_hv}')
print(f'isomerases in reference proteome: {ec5_hs}, in H.volcanii: {ec5_hv}')
print(f'ligases in reference proteome: {ec6_hs}, in H.volcanii: {ec6_hv}')
print(f'translocases in reference proteome: {ec7_hs}, in H.volcanii: {ec7_hv}')
oxidoreductases in reference proteome: 3, in H.volcanii: 20 transferases in reference proteome: 9, in H.volcanii: 23 hydrolases in reference proteome: 5, in H.volcanii: 25 hydrolases in reference proteome: 2, in H.volcanii: 13 isomerases in reference proteome: 1, in H.volcanii: 5 ligases in reference proteome: 3, in H.volcanii: 7 translocases in reference proteome: 1, in H.volcanii: 0
def keywords(entries, location):
functions = ''
for entry in entries:
if entry.find(location) != -1:
start = entry.find('\nKW')+4
end1 = entry.find('\nFT')
end2 = entry.find('\nSQ')
if start != -1:
if end1 != -1:
function = ';'.join(entry[start:end1].split('KW'))
if end2 != -1 and end1 == -1:
function = ';'.join(entry[start:end2].split('\nSQ'))
functions += function
unordered_list_KW = [f.strip() for f in functions.split(';') if f.strip() != '']
for i in range(len(unordered_list_KW)):
keyword = unordered_list_KW[i]
if keyword.find('{') != -1:
unordered_list_KW[i] = keyword[:keyword.find('{')].strip()
res = dict()
for f in unordered_list_KW:
res[f] = res.get(f, 0) + 1
return [x for x in sorted(res.items(), key=lambda x:x[1], reverse=True) if x[0][:3] !='ECO']
#Haloferax volcanii = hv
print(*keywords(entries_hv, 'Plasmid pHV4'), sep='\n')
('Reference proteome', 440) ('Plasmid', 283) ('KW Reference proteome', 120) ('Transmembrane', 103) ('Transmembrane helix', 103) ('ProRule:PRU00434}', 36) ('Nucleotide-binding', 31) ('Transport', 25) ('Membrane', 20) ('Transferase', 20) ('Metal-binding', 18) ('Hydrolase', 15) ('Transcription', 15) ('Transcription regulation', 14) ('KW Plasmid', 13) ('Oxidoreductase', 12) ('Zinc', 10) ('Zinc-finger', 8) ('Lyase', 8) ('Repeat', 7) ('Reference proteome. Antiviral defense', 6) ('Cytoplasm', 6) ('DNA-binding', 6) ('Magnesium', 6) ('Nuclease', 6) ('KW Transferase', 6) ('Iron', 5) ('Iron-sulfur', 5) ('Rule:MF_01465}', 5) ('Repressor', 4) ('Ligase', 4) ('Rule:MF_00383}', 4) ('Manganese', 3) ('Flavoprotein', 3) ('Endonuclease', 3) ('Kinase', 3) ('Phosphoprotein', 3) ('Coiled coil', 3) ('Glutamine amidotransferase', 3) ('Protease', 3) ('Transposition', 3) ('Histidine metabolism', 3) ('DNA replication', 3) ('Rule:MF_01407}', 3) ('Helicase', 2) ('ProRule:PRU00181}', 2) ('Protein biosynthesis', 2) ('KW Pyridoxal phosphate', 2) ('KW Metal-binding', 2) ('Transposable element', 2) ('NAD', 2) ('KW Nucleotide-binding', 2) ('Branched-chain amino acid biosynthesis', 2) ('Leucine biosynthesis', 2) ('Rule:MF_01027}', 2) ('Reference proteome. Cell membrane', 1) ('FAD', 1) ('ATP-binding', 1) ('Reference proteome. Cell shape', 1) ('GTP-binding', 1) ('Reference proteome. ATP-binding', 1) ('Carbohydrate metabolism', 1) ('Transferase. 4Fe-4S', 1) ('Antiviral defense', 1) ('Exonuclease', 1) ('RNA-binding. Antiviral defense', 1) ('Reference proteome. Membrane', 1) ('DNA recombination', 1) ('Rotamase', 1) ('Rule:MF_00265}', 1) ('Toxin', 1) ('Toxin-antitoxin system', 1) ('Rule:MF_00265}. Membrane', 1) ('ProRule:PRU00469}', 1) ('ProRule:PRU00469}. Plasmid', 1) ('Hydrogen ion transport', 1) ('Ion transport', 1) ('Rule:MF_00028}', 1) ('Glycoprotein', 1) ('S-layer', 1) ('Secreted', 1) ('Signal', 1) ('KW Toxin-antitoxin system', 1) ('Phosphate transport', 1) ('Aminotransferase', 1) ('Histidine biosynthesis', 1) ('KW Zinc', 1) ('ProRule:PRU00703}', 1) ('DNA damage', 1) ('Nucleotidyltransferase', 1) ('Lipid metabolism', 1) ('Cell membrane', 1) ('Protein transport', 1) ('Translocation', 1) ('Methyltransferase', 1) ('Rule:MF_00577}', 1) ('Metalloprotease', 1) ('KW ATP-binding', 1) ('KW Ligase', 1) ('KW Protein biosynthesis', 1) ('Rule:MF_00229}', 1) ('FMN', 1) ('Cobalamin biosynthesis', 1) ('Thiamine pyrophosphate', 1) ('Symport', 1) ('KW Chaperone', 1) ('Rule:MF_00372}', 1) ('Translocase', 1) ('ProRule:PRU01248}', 1) ('Rule:MF_00060}', 1) ('Amino-acid biosynthesis', 1) ('Rule:MF_01401}', 1) ('KW Magnesium', 1)
#Halobacterium salinarum = hs
print(*keywords(entries_hs, 'Plasmid pNRC200'), sep='\n')
('Reference proteome', 229) ('Plasmid', 149) ('KW Reference proteome', 86) ('Transmembrane', 51) ('Transmembrane helix', 51) ('Nucleotide-binding', 21) ('Vacuole. Gas vesicle', 18) ('Membrane', 15) ('Transport', 15) ('Metal-binding', 9) ('DNA-binding', 9) ('Repeat', 8) ('Transcription', 8) ('Vacuole. ATP-binding', 7) ('Cytoplasm', 6) ('Potassium', 6) ('Potassium transport', 6) ('DNA replication', 6) ('KW Plasmid', 6) ('ProRule:PRU00434}', 6) ('Gas vesicle', 5) ('Zinc', 5) ('Transferase', 5) ('ATP-binding', 4) ('Magnesium', 4) ('Transcription regulation', 4) ('Transposition', 4) ('Hydrolase', 4) ('NAD', 4) ('Reference proteome. Gas vesicle', 3) ('Cell membrane', 3) ('Ion transport', 3) ('Iron', 3) ('Oxidoreductase', 3) ('Coiled coil', 3) ('Rule:MF_01946}', 3) ('Ligase', 2) ('Reference proteome. Coiled coil', 2) ('Phosphoprotein', 2) ('Pyrimidine biosynthesis', 2) ('Reference proteome. ATP-binding', 2) ('KW Transcription', 2) ('KW Transcription regulation', 2) ('Helicase', 2) ('Phosphate transport', 2) ('S-adenosyl-L-methionine', 2) ('Electron transport', 2) ('Protease', 2) ('Cell division', 2) ('ProRule:PRU01248}', 2) ('Zinc. Aminoacyl-tRNA synthetase', 1) ('Protein biosynthesis', 1) ('Translocase', 1) ('Transport. Cell membrane', 1) ('Transport. Gas vesicle', 1) ('Vacuole. Heme', 1) ('Hydrogen peroxide', 1) ('Peroxidase', 1) ('Transport. ATP-binding', 1) ('Reference proteome. Direct protein sequencing', 1) ('Vacuole. Arginine metabolism', 1) ('Direct protein sequencing', 1) ('Transferase. DNA recombination', 1) ('Transposable element', 1) ('Transposition. Plasmid', 1) ('Transferase. DNA-binding', 1) ('Transcription regulation. Metal-binding', 1) ('Zinc. Metal-binding', 1) ('Zinc-finger. Plasmid', 1) ('Reference proteome. Metal-binding', 1) ('Zinc-finger. DNA-binding', 1) ('Transcription regulation. DNA-binding', 1) ('Transcription regulation. ATP-binding', 1) ('Reference proteome. Cytoplasm', 1) ('Flavoprotein', 1) ('FMN', 1) ('Isomerase', 1) ('Isoprene biosynthesis', 1) ('NADP', 1) ('Reference proteome. Amino-acid transport', 1) ('Antiport', 1) ('Transport. Arginine metabolism', 1) ('Kinase', 1) ('Transferase. ATP-binding', 1) ('Reference proteome. Plasmid', 1) ('Symport', 1) ('ProRule:PRU00703}', 1) ('KW Pyridoxal phosphate', 1) ('KW Transferase', 1) ('Zinc-finger', 1) ('Methyltransferase', 1) ('Heme', 1) ('Stress response', 1) ('Cell shape', 1) ('GTP-binding', 1) ('DNA-directed DNA polymerase', 1) ('Nucleotidyltransferase', 1) ('Iron-sulfur', 1) ('Lyase', 1) ('Rule:MF_00917}. Plasmid', 1) ('Rule:MF_00265}', 1) ('Nuclease', 1) ('Toxin', 1) ('Toxin-antitoxin system', 1) ('Rule:MF_00265}. Plasmid', 1) ('DNA damage', 1) ('ProRule:PRU00169}', 1) ('Pyridoxal phosphate', 1)