In [285]:
import gzip
#Haloferax volcanii = hv
#Halobacterium salinarum = hs
with gzip.open('UP000000554.swiss.gz', 'rt') as hs:
    entries_hs = ''.join(hs.readlines()).split('\nID')
    ec1_hs = 0
    ec2_hs = 0
    ec3_hs = 0
    ec4_hs = 0
    ec5_hs = 0
    ec6_hs = 0
    ec7_hs = 0
    for entry in entries_hs:
        if entry.find('Plasmid pNRC200') != -1:
            if entry.find('EC=1.') != -1:
                ec1_hs += 1
            if entry.find('EC=2.') != -1:
                ec2_hs += 1
            if entry.find('EC=3.') != -1:
                ec3_hs += 1
            if entry.find('EC=4.') != -1:
                ec4_hs += 1
            if entry.find('EC=5.') != -1:
                ec5_hs += 1
            if entry.find('EC=6.') != -1:
                ec6_hs += 1
            if entry.find('EC=7.') != -1:
                ec7_hs += 1
with gzip.open('UP000008243.swiss.gz', 'rt') as hv:
    entries_hv = ''.join(hv.readlines()).split('\nID')
    ec1_hv = 0
    ec2_hv = 0
    ec3_hv = 0
    ec4_hv = 0
    ec5_hv = 0
    ec6_hv = 0
    ec7_hv = 0
    for entry in entries_hv:
        if entry.find('Plasmid pHV4') != -1:
            if entry.find('EC=1.') != -1:
                ec1_hv += 1
            if entry.find('EC=2.') != -1:
                ec2_hv += 1
            if entry.find('EC=3.') != -1:
                ec3_hv += 1
            if entry.find('EC=4.') != -1:
                ec4_hv += 1
            if entry.find('EC=5.') != -1:
                ec5_hv += 1
            if entry.find('EC=6.') != -1:
                ec6_hv += 1
            if entry.find('EC=7.') != -1:
                ec7_hv += 1 
print(f'oxidoreductases in reference proteome: {ec1_hs}, in H.volcanii: {ec1_hv}')
print(f'transferases in reference proteome: {ec2_hs}, in H.volcanii: {ec2_hv}')
print(f'hydrolases in reference proteome: {ec3_hs}, in H.volcanii: {ec3_hv}')
print(f'hydrolases in reference proteome: {ec4_hs}, in H.volcanii: {ec4_hv}')
print(f'isomerases in reference proteome: {ec5_hs}, in H.volcanii: {ec5_hv}')
print(f'ligases in reference proteome: {ec6_hs}, in H.volcanii: {ec6_hv}')
print(f'translocases in reference proteome: {ec7_hs}, in H.volcanii: {ec7_hv}')
oxidoreductases in reference proteome: 3, in H.volcanii: 20
transferases in reference proteome: 9, in H.volcanii: 23
hydrolases in reference proteome: 5, in H.volcanii: 25
hydrolases in reference proteome: 2, in H.volcanii: 13
isomerases in reference proteome: 1, in H.volcanii: 5
ligases in reference proteome: 3, in H.volcanii: 7
translocases in reference proteome: 1, in H.volcanii: 0
In [311]:
def keywords(entries, location):
    functions = ''
    for entry in entries:
        if entry.find(location) != -1:
            start = entry.find('\nKW')+4
            end1 = entry.find('\nFT')
            end2 = entry.find('\nSQ')
            if start != -1:
                if end1 != -1:
                    function = ';'.join(entry[start:end1].split('KW'))
                if end2 != -1 and end1 == -1:
                    function = ';'.join(entry[start:end2].split('\nSQ'))
            functions += function
    unordered_list_KW = [f.strip() for f in functions.split(';') if f.strip() != '']
    for i in range(len(unordered_list_KW)):
        keyword = unordered_list_KW[i]
        if keyword.find('{') != -1:
            unordered_list_KW[i] = keyword[:keyword.find('{')].strip()
    res = dict()
    for f in unordered_list_KW:
        res[f] = res.get(f, 0) + 1
    return [x for x in sorted(res.items(), key=lambda x:x[1], reverse=True) if x[0][:3] !='ECO']
In [312]:
#Haloferax volcanii = hv
print(*keywords(entries_hv, 'Plasmid pHV4'), sep='\n')
('Reference proteome', 440)
('Plasmid', 283)
('KW   Reference proteome', 120)
('Transmembrane', 103)
('Transmembrane helix', 103)
('ProRule:PRU00434}', 36)
('Nucleotide-binding', 31)
('Transport', 25)
('Membrane', 20)
('Transferase', 20)
('Metal-binding', 18)
('Hydrolase', 15)
('Transcription', 15)
('Transcription regulation', 14)
('KW   Plasmid', 13)
('Oxidoreductase', 12)
('Zinc', 10)
('Zinc-finger', 8)
('Lyase', 8)
('Repeat', 7)
('Reference proteome.  Antiviral defense', 6)
('Cytoplasm', 6)
('DNA-binding', 6)
('Magnesium', 6)
('Nuclease', 6)
('KW   Transferase', 6)
('Iron', 5)
('Iron-sulfur', 5)
('Rule:MF_01465}', 5)
('Repressor', 4)
('Ligase', 4)
('Rule:MF_00383}', 4)
('Manganese', 3)
('Flavoprotein', 3)
('Endonuclease', 3)
('Kinase', 3)
('Phosphoprotein', 3)
('Coiled coil', 3)
('Glutamine amidotransferase', 3)
('Protease', 3)
('Transposition', 3)
('Histidine metabolism', 3)
('DNA replication', 3)
('Rule:MF_01407}', 3)
('Helicase', 2)
('ProRule:PRU00181}', 2)
('Protein biosynthesis', 2)
('KW   Pyridoxal phosphate', 2)
('KW   Metal-binding', 2)
('Transposable element', 2)
('NAD', 2)
('KW   Nucleotide-binding', 2)
('Branched-chain amino acid biosynthesis', 2)
('Leucine biosynthesis', 2)
('Rule:MF_01027}', 2)
('Reference proteome.  Cell membrane', 1)
('FAD', 1)
('ATP-binding', 1)
('Reference proteome.  Cell shape', 1)
('GTP-binding', 1)
('Reference proteome.  ATP-binding', 1)
('Carbohydrate metabolism', 1)
('Transferase.  4Fe-4S', 1)
('Antiviral defense', 1)
('Exonuclease', 1)
('RNA-binding.  Antiviral defense', 1)
('Reference proteome.  Membrane', 1)
('DNA recombination', 1)
('Rotamase', 1)
('Rule:MF_00265}', 1)
('Toxin', 1)
('Toxin-antitoxin system', 1)
('Rule:MF_00265}.  Membrane', 1)
('ProRule:PRU00469}', 1)
('ProRule:PRU00469}.  Plasmid', 1)
('Hydrogen ion transport', 1)
('Ion transport', 1)
('Rule:MF_00028}', 1)
('Glycoprotein', 1)
('S-layer', 1)
('Secreted', 1)
('Signal', 1)
('KW   Toxin-antitoxin system', 1)
('Phosphate transport', 1)
('Aminotransferase', 1)
('Histidine biosynthesis', 1)
('KW   Zinc', 1)
('ProRule:PRU00703}', 1)
('DNA damage', 1)
('Nucleotidyltransferase', 1)
('Lipid metabolism', 1)
('Cell membrane', 1)
('Protein transport', 1)
('Translocation', 1)
('Methyltransferase', 1)
('Rule:MF_00577}', 1)
('Metalloprotease', 1)
('KW   ATP-binding', 1)
('KW   Ligase', 1)
('KW   Protein biosynthesis', 1)
('Rule:MF_00229}', 1)
('FMN', 1)
('Cobalamin biosynthesis', 1)
('Thiamine pyrophosphate', 1)
('Symport', 1)
('KW   Chaperone', 1)
('Rule:MF_00372}', 1)
('Translocase', 1)
('ProRule:PRU01248}', 1)
('Rule:MF_00060}', 1)
('Amino-acid biosynthesis', 1)
('Rule:MF_01401}', 1)
('KW   Magnesium', 1)
In [313]:
#Halobacterium salinarum = hs
print(*keywords(entries_hs, 'Plasmid pNRC200'), sep='\n')
('Reference proteome', 229)
('Plasmid', 149)
('KW   Reference proteome', 86)
('Transmembrane', 51)
('Transmembrane helix', 51)
('Nucleotide-binding', 21)
('Vacuole.  Gas vesicle', 18)
('Membrane', 15)
('Transport', 15)
('Metal-binding', 9)
('DNA-binding', 9)
('Repeat', 8)
('Transcription', 8)
('Vacuole.  ATP-binding', 7)
('Cytoplasm', 6)
('Potassium', 6)
('Potassium transport', 6)
('DNA replication', 6)
('KW   Plasmid', 6)
('ProRule:PRU00434}', 6)
('Gas vesicle', 5)
('Zinc', 5)
('Transferase', 5)
('ATP-binding', 4)
('Magnesium', 4)
('Transcription regulation', 4)
('Transposition', 4)
('Hydrolase', 4)
('NAD', 4)
('Reference proteome.  Gas vesicle', 3)
('Cell membrane', 3)
('Ion transport', 3)
('Iron', 3)
('Oxidoreductase', 3)
('Coiled coil', 3)
('Rule:MF_01946}', 3)
('Ligase', 2)
('Reference proteome.  Coiled coil', 2)
('Phosphoprotein', 2)
('Pyrimidine biosynthesis', 2)
('Reference proteome.  ATP-binding', 2)
('KW   Transcription', 2)
('KW   Transcription regulation', 2)
('Helicase', 2)
('Phosphate transport', 2)
('S-adenosyl-L-methionine', 2)
('Electron transport', 2)
('Protease', 2)
('Cell division', 2)
('ProRule:PRU01248}', 2)
('Zinc.  Aminoacyl-tRNA synthetase', 1)
('Protein biosynthesis', 1)
('Translocase', 1)
('Transport.  Cell membrane', 1)
('Transport.  Gas vesicle', 1)
('Vacuole.  Heme', 1)
('Hydrogen peroxide', 1)
('Peroxidase', 1)
('Transport.  ATP-binding', 1)
('Reference proteome.  Direct protein sequencing', 1)
('Vacuole.  Arginine metabolism', 1)
('Direct protein sequencing', 1)
('Transferase.  DNA recombination', 1)
('Transposable element', 1)
('Transposition.  Plasmid', 1)
('Transferase.  DNA-binding', 1)
('Transcription regulation.  Metal-binding', 1)
('Zinc.  Metal-binding', 1)
('Zinc-finger.  Plasmid', 1)
('Reference proteome.  Metal-binding', 1)
('Zinc-finger.  DNA-binding', 1)
('Transcription regulation.  DNA-binding', 1)
('Transcription regulation.  ATP-binding', 1)
('Reference proteome.  Cytoplasm', 1)
('Flavoprotein', 1)
('FMN', 1)
('Isomerase', 1)
('Isoprene biosynthesis', 1)
('NADP', 1)
('Reference proteome.  Amino-acid transport', 1)
('Antiport', 1)
('Transport.  Arginine metabolism', 1)
('Kinase', 1)
('Transferase.  ATP-binding', 1)
('Reference proteome.  Plasmid', 1)
('Symport', 1)
('ProRule:PRU00703}', 1)
('KW   Pyridoxal phosphate', 1)
('KW   Transferase', 1)
('Zinc-finger', 1)
('Methyltransferase', 1)
('Heme', 1)
('Stress response', 1)
('Cell shape', 1)
('GTP-binding', 1)
('DNA-directed DNA polymerase', 1)
('Nucleotidyltransferase', 1)
('Iron-sulfur', 1)
('Lyase', 1)
('Rule:MF_00917}.  Plasmid', 1)
('Rule:MF_00265}', 1)
('Nuclease', 1)
('Toxin', 1)
('Toxin-antitoxin system', 1)
('Rule:MF_00265}.  Plasmid', 1)
('DNA damage', 1)
('ProRule:PRU00169}', 1)
('Pyridoxal phosphate', 1)
In [ ]:
 
In [ ]: