Tiezheng Yuan Ph.D.: Bioinformatics in Python: retrieve GO terms

Abstract: Retrieve GO(Gene Ontology) terms from http://geneontology.org/, and save them into a dictionary

The result:
data-version: releases/2015-08-13
Total number of GO terms: 43555
ok

The script:
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 14:52:00 2015

@author: yuan
"""
import re
import os.path
import urllib2

class GO:
    def __init__(self, file):
        self.obo_file=file
        if not os.path.isfile(file):
            GO_linkage='http://geneontology.org/ontology/go.obo'
            print 'Download GO terms into', file
            self.retrieve_web_file(GO_linkage, file)

    def retrieve_web_file(self, linkage, file):
        ##retieve a web file
        urlfile=urllib2.urlopen(linkage)
        html=urlfile.read()
        out_obj=open(file, 'w')
        out_obj.write(html)
        out_obj.close()

    def analyze_GOblock(self, block):
        term={}
        for line in block:
            items=line.split(": ")
            #print items
            if len(items)>=2:
                if items[0]=='is_a':
                    sub_id, sub_def=items[1].split(" ! ")
                    if term.has_key('is_a'):
                        term['is_a'] += ','+sub_id
                    else:
                        term['is_a']=sub_id
                else:
                    term[items[0]]=items[1]
        #print term['id'], term['name'], term['is_a']
        return term

    def extract_GO_terms(self):
        n=0
        block=[]
        #keys is GO ids, and value is dict of this term
        GO_terms={}
        #correlations between GO terms
        GO2_dict={} #one term vs multiple terms
        GO1_list=[] #one term vs one term
        in_obj=open(self.obo_file, 'r')
        for line in in_obj:
            if line=="\n":
                #GO terms block
                if n>=1:
                    GOterm=self.analyze_GOblock(block)
                    GO_terms[GOterm['id']]=GOterm
                    if GOterm.has_key('is_a'):
                        GO2_dict[GOterm['id']]=GOterm['is_a']
                        for term_id in GOterm['is_a'].split(','):
                            GO1_list.append((GOterm['id'], term_id))
                else:
                    print block[1]
                block=[]
                n += 1
            #"\n" sepearted
            else:
                line=line.rstrip("\n")
                block.append(line)
        in_obj.close()
        #print GO2_dict
        print 'Total number of GO terms:', n
        return GO_terms, GO2_dict, GO1_list

if __name__=="__main__":
    file='/home/yuan/mysql_pre/cookbooks/go.obo'
    g=GO(file)
    g. extract_GO_terms()

    print 'ok'

Tiezheng Yuan Ph.D.

Thursday, August 13, 2015

Bioinformatics in Python: retrieve GO terms

No comments:

Post a Comment