Tiezheng Yuan Ph.D.: Perl: Download genome annotations from NCBI FTP site

Abstract: Download genome sequences including DNA, RNA and protein sequences as FASTA or annotation in GFF format.

The codes of main script:

###extract annotations from NCBI

#my @species=qw/human mouse rat cattle dog chicken maize zebrafish soybean rice/;

my @species=qw/rice/;

foreach my $specie(@species){

sub_bioperl::download_NCBI_genome('DNA_fa', $specie, '/home/yuan/data_preparation/');

sub_bioperl::download_NCBI_genome('RNA_fa', $specie, '/home/yuan/data_preparation/');

sub_bioperl::download_NCBI_genome('RNA_gbk', $specie, '/home/yuan/data_preparation/');

sub_bioperl::download_NCBI_genome('protein_fa', $specie, '/home/yuan/data_preparation/');

sub_bioperl::download_NCBI_genome('protein_gbk', $specie, '/home/yuan/data_preparation/');

sub_bioperl::download_NCBI_genome('annot_gff', $specie, '/home/yuan/data_preparation/');

}

The codes of the function download_NCBI_genome:

######

sub download_NCBI_genome{

my($genome_type, $specie, $out_dir)=@_;

my %species=('human'=>'H_sapiens', 'mouse'=>'Mus_musculus', 'rat'=>'Rattus_norvegicus',

'cattle'=>'Bos_taurus', 'dog'=>'Canis_familiaris', 'chicken'=>'Gallus_gallus', 'zebrafish'=>'Danio_rerio',

'soyben'=>'Glycine_max', 'maize'=>'Zea_mays', 'rice'=>'Oryza_sativa_Japonica_Group');

my $url_specie=($species{$specie}) ? $species{$specie} : $species{'human'};

my $db_url='ftp://ftp.ncbi.nlm.nih.gov/genomes/'.$url_specie.'/';

$out_dir=Cwd::getcwd() unless $out_dir;

$out_dir=sub_common::format_directory($out_dir.$specie);

my @local_files;

if($genome_type eq 'DNA_fa'){

my $url = $db_url.'Assembled_chromosomes/seq/';

my $web_file_names_pointer=sub_common::web_list_files($url, "file_names", "_ref_");

my @web_file_names=@$web_file_names_pointer;

@web_file_names=grep(/\.fa\./, @web_file_names);

@web_file_names=grep(/chr/, @web_file_names);

#print join("\n", @web_file_names);

#download files

foreach my $web_file_name(@web_file_names){

my $web_file=$url.$web_file_name;

my $saved_file=sub_common::web_download_file($web_file, $out_dir);

push(@local_files, $saved_file);

}

elsif($genome_type eq 'RNA_fa'){

my $web_file = $db_url.'RNA/rna.fa.gz';

my $saved_file=sub_common::web_download_file($web_file, $out_dir);

push(@local_files, $saved_file);

}

elsif($genome_type eq 'RNA_gbk'){

my $web_file = $db_url.'RNA/rna.gbk.gz';

my $saved_file=sub_common::web_download_file($web_file, $out_dir);

push(@local_files, $saved_file);

}

elsif($genome_type eq 'protein_fa'){

my $web_file = $db_url.'protein/protein.fa.gz';

my $saved_file=sub_common::web_download_file($web_file, $out_dir);

push(@local_files, $saved_file);

}

elsif($genome_type eq 'protein_gbk'){

my $web_file = $db_url.'protein/protein.gbk.gz';

my $saved_file=sub_common::web_download_file($web_file, $out_dir);

push(@local_files, $saved_file);

}

elsif($genome_type eq 'annot_gff'){

#print $db_url.'GFF/', "\n";

my $web_file=sub_common::web_list_files($db_url.'GFF/', 'file', 'top_level');

print "$web_file\n";

my $saved_file=sub_common::web_download_file($web_file, $out_dir);

push(@local_files, $saved_file);

}

else{

print "Error: No files downloaded\n";

}

#print join("\n", @local_files, "\n");

return(\@local_files);

}

The codes of dependent functions:

###################3

sub web_download_file{

my($file_url, $out_dir, $unpack)=@_;

$file_url=~s/\/$//;

$out_dir=format_directory($out_dir);

$unpack=1 unless $unpack; #default is to uncompress the file;

#download file

my $ff = File::Fetch->new(uri => $file_url);

$ff->fetch(to=>$out_dir) or die $ff->error;

#return local file

my $gz_file=$out_dir.file_operation($file_url, 'file_name');

#uncompress gz file

my $local_file=$gz_file;

$local_file=~s/\.gz$//;

system("rm $local_file") if -f $local_file;

system("gunzip $gz_file");

printf( "Download a file from %s and save it into %s\n", $file_url, $local_file);

return($local_file);

}

Tiezheng Yuan Ph.D.

Wednesday, May 18, 2016

Perl: Download genome annotations from NCBI FTP site

No comments:

Post a Comment