Abstract: Retrieve GO(Gene Ontology) terms from http://geneontology.org/, and save them into a dictionary
The result:
data-version: releases/2015-08-13
Total number of GO terms: 43555
ok
The script:
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 14:52:00 2015
@author: yuan
"""
import re
import os.path
import urllib2
class GO:
def __init__(self, file):
self.obo_file=file
if not os.path.isfile(file):
GO_linkage='http://geneontology.org/ontology/go.obo'
print 'Download GO terms into', file
self.retrieve_web_file(GO_linkage, file)
def retrieve_web_file(self, linkage, file):
##retieve a web file
urlfile=urllib2.urlopen(linkage)
html=urlfile.read()
out_obj=open(file, 'w')
out_obj.write(html)
out_obj.close()
def analyze_GOblock(self, block):
term={}
for line in block:
items=line.split(": ")
#print items
if len(items)>=2:
if items[0]=='is_a':
sub_id, sub_def=items[1].split(" ! ")
if term.has_key('is_a'):
term['is_a'] += ','+sub_id
else:
term['is_a']=sub_id
else:
term[items[0]]=items[1]
#print term['id'], term['name'], term['is_a']
return term
def extract_GO_terms(self):
n=0
block=[]
#keys is GO ids, and value is dict of this term
GO_terms={}
#correlations between GO terms
GO2_dict={} #one term vs multiple terms
GO1_list=[] #one term vs one term
in_obj=open(self.obo_file, 'r')
for line in in_obj:
if line=="\n":
#GO terms block
if n>=1:
GOterm=self.analyze_GOblock(block)
GO_terms[GOterm['id']]=GOterm
if GOterm.has_key('is_a'):
GO2_dict[GOterm['id']]=GOterm['is_a']
for term_id in GOterm['is_a'].split(','):
GO1_list.append((GOterm['id'], term_id))
else:
print block[1]
block=[]
n += 1
#"\n" sepearted
else:
line=line.rstrip("\n")
block.append(line)
in_obj.close()
#print GO2_dict
print 'Total number of GO terms:', n
return GO_terms, GO2_dict, GO1_list
if __name__=="__main__":
file='/home/yuan/mysql_pre/cookbooks/go.obo'
g=GO(file)
g. extract_GO_terms()
print 'ok'
No comments:
Post a Comment