Wednesday, May 18, 2016

Perl: Download genome annotations from NCBI FTP site


Abstract: Download genome sequences including DNA, RNA and protein sequences as FASTA or annotation in GFF format.


The codes of main script:
###extract annotations from NCBI
#my @species=qw/human mouse rat cattle dog chicken maize zebrafish soybean rice/;
my @species=qw/rice/;
foreach my $specie(@species){
sub_bioperl::download_NCBI_genome('DNA_fa', $specie, '/home/yuan/data_preparation/');
sub_bioperl::download_NCBI_genome('RNA_fa', $specie, '/home/yuan/data_preparation/');
sub_bioperl::download_NCBI_genome('RNA_gbk', $specie, '/home/yuan/data_preparation/');
sub_bioperl::download_NCBI_genome('protein_fa', $specie, '/home/yuan/data_preparation/');
sub_bioperl::download_NCBI_genome('protein_gbk', $specie, '/home/yuan/data_preparation/');
sub_bioperl::download_NCBI_genome('annot_gff', $specie, '/home/yuan/data_preparation/');
}


The codes of the function download_NCBI_genome:
######
sub download_NCBI_genome{
my($genome_type, $specie, $out_dir)=@_;
my %species=('human'=>'H_sapiens', 'mouse'=>'Mus_musculus', 'rat'=>'Rattus_norvegicus',
'cattle'=>'Bos_taurus', 'dog'=>'Canis_familiaris', 'chicken'=>'Gallus_gallus', 'zebrafish'=>'Danio_rerio',
'soyben'=>'Glycine_max', 'maize'=>'Zea_mays', 'rice'=>'Oryza_sativa_Japonica_Group');
my $url_specie=($species{$specie}) ? $species{$specie} : $species{'human'};
my $db_url='ftp://ftp.ncbi.nlm.nih.gov/genomes/'.$url_specie.'/';
$out_dir=Cwd::getcwd() unless $out_dir;
$out_dir=sub_common::format_directory($out_dir.$specie);
#
my @local_files;
if($genome_type eq 'DNA_fa'){
my $url = $db_url.'Assembled_chromosomes/seq/';
my $web_file_names_pointer=sub_common::web_list_files($url, "file_names", "_ref_");
my @web_file_names=@$web_file_names_pointer;
@web_file_names=grep(/\.fa\./, @web_file_names);
@web_file_names=grep(/chr/, @web_file_names);
#print join("\n", @web_file_names);
#download files
foreach my $web_file_name(@web_file_names){
my $web_file=$url.$web_file_name;
my $saved_file=sub_common::web_download_file($web_file, $out_dir);
push(@local_files, $saved_file);
}
}
elsif($genome_type eq 'RNA_fa'){
my $web_file = $db_url.'RNA/rna.fa.gz';
my $saved_file=sub_common::web_download_file($web_file, $out_dir);
push(@local_files, $saved_file);
}
elsif($genome_type eq 'RNA_gbk'){
my $web_file = $db_url.'RNA/rna.gbk.gz';
my $saved_file=sub_common::web_download_file($web_file, $out_dir);
push(@local_files, $saved_file);
}
elsif($genome_type eq 'protein_fa'){
my $web_file = $db_url.'protein/protein.fa.gz';
my $saved_file=sub_common::web_download_file($web_file, $out_dir);
push(@local_files, $saved_file);
}
elsif($genome_type eq 'protein_gbk'){
my $web_file = $db_url.'protein/protein.gbk.gz';
my $saved_file=sub_common::web_download_file($web_file, $out_dir);
push(@local_files, $saved_file);
}
elsif($genome_type eq 'annot_gff'){
#print $db_url.'GFF/', "\n";
my $web_file=sub_common::web_list_files($db_url.'GFF/', 'file', 'top_level');
print "$web_file\n";
my $saved_file=sub_common::web_download_file($web_file, $out_dir);
push(@local_files, $saved_file);
}
else{
print "Error: No files downloaded\n";
}
#print join("\n", @local_files, "\n");
return(\@local_files);
}

The codes of dependent functions:

###################3
sub web_download_file{
my($file_url, $out_dir, $unpack)=@_;
$file_url=~s/\/$//;
$out_dir=format_directory($out_dir);
$unpack=1 unless $unpack; #default is to uncompress the file;
#download file
my $ff = File::Fetch->new(uri => $file_url);
$ff->fetch(to=>$out_dir) or die $ff->error;
#return local file
my $gz_file=$out_dir.file_operation($file_url, 'file_name');
#uncompress gz file
my $local_file=$gz_file;
$local_file=~s/\.gz$//;
system("rm $local_file") if -f $local_file;
system("gunzip $gz_file");
printf( "Download a file from %s and save it into %s\n", $file_url, $local_file);
return($local_file);
}

No comments:

Post a Comment