Abstract: retrieve GO annotations with a given specie from ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/
We will store the relationships between genes(DB_id) and GO terms(GO_id) into a dictionary.
Here is the introductions of the goa files:
Column No. Contents
1. DB
2. DB_Object_ID
3. DB_Object_Symbol
4. Qualifier
5. GO_ID
6. DB:Reference
7. Evidence Code
8. With (or) From
9. Aspect
10. DB_Object_Name
11. DB_Object_Synonym
12. DB_Object_Type
13. Taxon and Interacting taxon
14. Date
15. Assigned_By
16. Annotation_Extension
17. Gene_Product_Form_ID
The result:
Number of genes with GO annotations: 47355
ok
The script:
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 16:28:44 2015
@author: yuan
"""
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 14:52:00 2015
@author: yuan
"""
import re
import os.path
import urllib
import gzip
class GO:
def __init__(self, specie, out_dir):
file_name='gene_association.goa_'+specie.lower()+'.gz'
self.file=out_dir+file_name
if not os.path.isfile(self.file):
linkage='ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/'+specie+'/'+file_name
print 'Web linkage:', linkage
#download file
urllib.urlretrieve(linkage, self.file)
print 'Download GO terms into', self.file
def GO_annotation(self):
if self.file.rfind('.gz')>0:
in_obj=gzip.open(self.file, 'r')
else:
in_obj=open(self.file, 'r')
#
n=0
GOannot={}
for line in in_obj:
line=line.rstrip("\n")
if not line.find("!")==0:
items=line.split("\t")
DB_id=items[1]
GO_id=items[4]
if GOannot.has_key(DB_id):
GOannot[DB_id] += ','+GO_id
else:
GOannot[DB_id]=GO_id
n+=1
in_obj.close()
print 'Number of genes with GO annotations:',n
return GOannot
if __name__=="__main__":
#other species: CHICHEN, COW, DICTY, DOG, FLY, MOUSE, PIG, RAT, YEAST, ZEBRAFISH
specie='HUMAN'
out_dir='/home/yuan/mysql_pre/cookbooks/'
g=GO(specie, out_dir)
GOannot=g.GO_annotation()
print 'ok'
No comments:
Post a Comment