Abstract:
Download genome sequences including DNA, RNA and protein sequences as
FASTA or annotation in GFF format.
The codes of main
script:
###extract
annotations from NCBI
#my
@species=qw/human mouse rat cattle dog chicken maize zebrafish
soybean rice/;
my
@species=qw/rice/;
foreach
my $specie(@species){
sub_bioperl::download_NCBI_genome('DNA_fa',
$specie, '/home/yuan/data_preparation/');
sub_bioperl::download_NCBI_genome('RNA_fa',
$specie, '/home/yuan/data_preparation/');
sub_bioperl::download_NCBI_genome('RNA_gbk',
$specie, '/home/yuan/data_preparation/');
sub_bioperl::download_NCBI_genome('protein_fa',
$specie, '/home/yuan/data_preparation/');
sub_bioperl::download_NCBI_genome('protein_gbk',
$specie, '/home/yuan/data_preparation/');
sub_bioperl::download_NCBI_genome('annot_gff',
$specie, '/home/yuan/data_preparation/');
}
The codes of the
function download_NCBI_genome:
######
sub
download_NCBI_genome{
my($genome_type,
$specie, $out_dir)=@_;
my
%species=('human'=>'H_sapiens', 'mouse'=>'Mus_musculus',
'rat'=>'Rattus_norvegicus',
'cattle'=>'Bos_taurus',
'dog'=>'Canis_familiaris', 'chicken'=>'Gallus_gallus',
'zebrafish'=>'Danio_rerio',
'soyben'=>'Glycine_max',
'maize'=>'Zea_mays', 'rice'=>'Oryza_sativa_Japonica_Group');
my
$url_specie=($species{$specie}) ? $species{$specie} :
$species{'human'};
my
$db_url='ftp://ftp.ncbi.nlm.nih.gov/genomes/'.$url_specie.'/';
$out_dir=Cwd::getcwd()
unless $out_dir;
$out_dir=sub_common::format_directory($out_dir.$specie);
#
my
@local_files;
if($genome_type
eq 'DNA_fa'){
my
$url = $db_url.'Assembled_chromosomes/seq/';
my
$web_file_names_pointer=sub_common::web_list_files($url,
"file_names", "_ref_");
my
@web_file_names=@$web_file_names_pointer;
@web_file_names=grep(/\.fa\./,
@web_file_names);
@web_file_names=grep(/chr/,
@web_file_names);
#print
join("\n", @web_file_names);
#download
files
foreach
my $web_file_name(@web_file_names){
my
$web_file=$url.$web_file_name;
my
$saved_file=sub_common::web_download_file($web_file, $out_dir);
push(@local_files,
$saved_file);
}
}
elsif($genome_type
eq 'RNA_fa'){
my
$web_file = $db_url.'RNA/rna.fa.gz';
my
$saved_file=sub_common::web_download_file($web_file, $out_dir);
push(@local_files,
$saved_file);
}
elsif($genome_type
eq 'RNA_gbk'){
my
$web_file = $db_url.'RNA/rna.gbk.gz';
my
$saved_file=sub_common::web_download_file($web_file, $out_dir);
push(@local_files,
$saved_file);
}
elsif($genome_type
eq 'protein_fa'){
my
$web_file = $db_url.'protein/protein.fa.gz';
my
$saved_file=sub_common::web_download_file($web_file, $out_dir);
push(@local_files,
$saved_file);
}
elsif($genome_type
eq 'protein_gbk'){
my
$web_file = $db_url.'protein/protein.gbk.gz';
my
$saved_file=sub_common::web_download_file($web_file, $out_dir);
push(@local_files,
$saved_file);
}
elsif($genome_type
eq 'annot_gff'){
#print
$db_url.'GFF/', "\n";
my
$web_file=sub_common::web_list_files($db_url.'GFF/', 'file',
'top_level');
print
"$web_file\n";
my
$saved_file=sub_common::web_download_file($web_file, $out_dir);
push(@local_files,
$saved_file);
}
else{
print
"Error: No files downloaded\n";
}
#print
join("\n", @local_files, "\n");
return(\@local_files);
}
The codes of
dependent functions:
###################3
sub
web_download_file{
my($file_url,
$out_dir, $unpack)=@_;
$file_url=~s/\/$//;
$out_dir=format_directory($out_dir);
$unpack=1
unless $unpack; #default is to uncompress the file;
#download
file
my
$ff = File::Fetch->new(uri => $file_url);
$ff->fetch(to=>$out_dir)
or die $ff->error;
#return
local file
my
$gz_file=$out_dir.file_operation($file_url, 'file_name');
#uncompress
gz file
my
$local_file=$gz_file;
$local_file=~s/\.gz$//;
system("rm
$local_file") if -f $local_file;
system("gunzip
$gz_file");
printf(
"Download a file from %s and save it into %s\n", $file_url,
$local_file);
return($local_file);
}
No comments:
Post a Comment