Thursday, March 31, 2016

Perl: Storage of huge hash (2)


Abstract: Store a huge nested hash on disk using the DBM::Deep module.



Example: a text file of ~98 MB with ~0.8 million lines. When the file was loaded into a nested hash with around a million key/value pairs, the hash used about 161 MB of RAM. When the file was instead loaded into a DBM file tied to a hash, the tied hash used only about 5 kB of RAM, while the DBM file occupied ~430 MB on disk. Producing the DBM file takes a long time, but once it exists, loading the hash tied to it is very fast.

Here is the Perl code:
use warnings;
use strict;
use DBM::Deep;
use Devel::Size qw(size total_size);


# flatfile_to_hash2($infile)
#
# Read a comma-separated text file into a two-level in-memory hash.
#
# Each line is chomped and split on ','. The 2nd field becomes the outer key,
# the 4th field becomes the inner key, and the whole (chomped) line is stored
# as the value:
#     $hash{$field2}{$field4} = $line
# A later line with the same key pair overwrites the earlier one.
#
# Prints the number of lines read to STDOUT and returns a reference to the hash.
# Dies with the file name and the OS error if the file cannot be opened.
sub flatfile_to_hash2{
    my($infile)=@_;

    my $n=0;
    my %hash;
    open my($IN), "<", $infile or die "Cannot open '$infile' for reading: $!";
    # A lexical line variable is used so the caller's $_ is not clobbered.
    while(my $line=<$IN>){
        chomp($line);
        # Only the 2nd and 4th fields are needed as keys; the rest are skipped.
        my(undef, $key2, undef, $key4)=split(',', $line);
        $hash{$key2}->{$key4}=$line;
        $n++;
    }
    close($IN);
    print "$n\n";
    return(\%hash);
}
#
# flatfile_to_DBM_hash2($infile)
#
# Disk-backed counterpart of flatfile_to_hash2: store the same two-level
# structure ($db->{$field2}{$field4} = $line) in a DBM::Deep file named
# "$infile.db".
#
# The DBM file is generated only when it does not exist yet; in that case the
# number of lines read is printed to STDOUT. An existing file is reused as-is.
# NOTE(review): a file left behind by an interrupted run is also reused --
# delete it by hand to force regeneration.
#
# Returns a DBM::Deep hash object opened on the DBM file.
# Dies with the file name and the OS error if the input cannot be opened.
sub flatfile_to_DBM_hash2{
    my($infile)=@_;
    my $DBM_file=$infile.'.db';

    unless (-f $DBM_file){
        print "Generate $DBM_file\n";
        # Open the input BEFORE creating the DBM file, so a bad input path
        # does not leave an empty .db behind that later runs would reuse.
        open my($IN), "<", $infile or die "Cannot open '$infile' for reading: $!";
        my $db=DBM::Deep->new($DBM_file);
        # Start at 0 so the printed count equals the number of lines read
        # (same convention as flatfile_to_hash2).
        my $n=0;
        while(my $line=<$IN>){
            chomp($line);
            # Only the 2nd and 4th fields are needed as keys.
            my(undef, $key2, undef, $key4)=split(',', $line);
            $db->{$key2}->{$key4}=$line;
            $n++;
        }
        close($IN);
        # No begin_work here: every write above has already been applied, and
        # a transaction opened at this point would never be committed.
        print "$n\n";
    }

    # Alternative interface: tie my %hash, 'DBM::Deep', $DBM_file;
    my $db = DBM::Deep->new(file => $DBM_file,    type => DBM::Deep->TYPE_HASH  );
    return($db);
}

# Driver: compare the memory footprint of an in-memory nested hash with that
# of a DBM::Deep-backed hash built from the same CSV file.
#
# Usage: script.pl [input.csv]
# The input file defaults to the path below when no argument is given.
my $infile = @ARGV ? $ARGV[0] : '/home/yuan/phip/ref_seq/virus_dependent_peptides.csv';

# Baseline: size of an empty hash.
my %hash;
my $size=Devel::Size::total_size(\%hash);
printf( "Size of an empty hash: %s byte\n",  $size);

print "store a hash into RAM\n";
# Measure through the returned reference; copying the hash first would only
# duplicate its top level without changing what is measured.
my $pointer=flatfile_to_hash2($infile);
my $num=keys %$pointer;
$size=int(Devel::Size::total_size($pointer)/1024/1024);
printf( "Size of a hash with %s keys: %s MB\n", $num, $size);


print "store a hash into DBM file\n";
my $db=flatfile_to_DBM_hash2($infile);
$num=keys %$db;
# Size of the object held in RAM, not of the DBM file on disk.
$size=int(Devel::Size::total_size(\%$db)/1024);
printf( "Size of a DBM hash with %s keys: %s KB\n", $num, $size);
# Uncomment to dump every entry stored in the DBM file:
#foreach my $a(keys %$db){
# my $p=$db->{$a};
# foreach my $b(keys %$p){
#  printf("%s\t%s\t%s\n", $a, $b, $p->{$b});
# }
#}

print "ok\n";

No comments:

Post a Comment