# Abstract: Load a huge nested hash onto disk using the DBM::Deep module.
#
# Example: a text file of ~98MB with ~0.8 million lines. The file was loaded
# into a nested hash with a million key/value pairs, which used about 161MB
# of RAM. When the file was instead loaded into a DBM file tied to a hash,
# the tied hash used only 5kB of RAM, and the DBM file occupied ~430MB on
# disk. Producing the DBM file costs a lot of time, but once the DBM file
# exists, loading the hash tied to it is very fast.
#
# Here is the Perl code:

use warnings;
use strict;
use DBM::Deep;
use Devel::Size qw(size total_size);

# flatfile_to_hash2($infile)
#
# Reads a comma-separated text file into an in-memory two-level hash.
# For every line, the 2nd field is the outer key, the 4th field is the
# inner key, and the value is the whole (chomped) line.
#
# Prints the number of lines read and returns a reference to the hash.
# Dies if the input file cannot be opened or closed.
sub flatfile_to_hash2 {
    my ($infile) = @_;

    my $n = 0;
    my %hash;

    open my $IN, '<', $infile
        or die "Cannot open '$infile' for reading: $!";
    while (my $line = <$IN>) {
        chomp $line;

        # Only fields 2 and 4 (indices 1 and 3) are used as keys.
        my ($key2, $key4) = (split(',', $line))[1, 3];
        $hash{$key2}->{$key4} = $line;
        $n++;
    }
    close $IN or die "Cannot close '$infile': $!";

    print "$n\n";
    return (\%hash);
}

# flatfile_to_DBM_hash2($infile)
#
# Same mapping as flatfile_to_hash2, but the nested hash is stored on disk
# in a DBM::Deep file named "$infile.db". The DBM file is generated only
# if it does not exist yet; otherwise the existing file is reused as-is.
#
# Prints the number of lines loaded (only when the file is generated) and
# returns a DBM::Deep hash object opened on the DBM file.
# Dies if the input file cannot be opened or closed.
#
# NOTE(review): an interrupted first run leaves a partial "$infile.db"
# behind, which later runs will reuse; delete it by hand in that case.
sub flatfile_to_DBM_hash2 {
    my ($infile) = @_;
    my $DBM_file = $infile . '.db';

    unless (-f $DBM_file) {
        # Open the input BEFORE creating the DBM file, so a bad input path
        # does not leave an empty .db that would be reused on later runs.
        open my $IN, '<', $infile
            or die "Cannot open '$infile' for reading: $!";

        print "Generate $DBM_file\n";
        my $db = DBM::Deep->new($DBM_file);

        # Start at 0 so the printed count equals the number of lines,
        # consistent with flatfile_to_hash2.
        my $n = 0;
        while (my $line = <$IN>) {
            chomp $line;
            my ($key2, $key4) = (split(',', $line))[1, 3];
            $db->{$key2}->{$key4} = $line;
            $n++;
            #if ($n % 10000 ==0) {print "$n\n"};
            #print $line;
        }
        close $IN or die "Cannot close '$infile': $!";

        # No begin_work here: the original opened a transaction after the
        # load and never committed it, and DBM::Deep only supports
        # transactions when the file is created with num_txns > 1.
        print "$n\n";
    }

    #
    #my $db=tie my %hash, 'DBM::Deep', $DBM_file;
    #untie %hash;
    my $db = DBM::Deep->new(
        file => $DBM_file,
        type => DBM::Deep->TYPE_HASH,
    );
    return ($db);
}

# Input file: first command-line argument if given, otherwise the
# original hard-coded path.
my $infile = @ARGV
    ? $ARGV[0]
    : '/home/yuan/phip/ref_seq/virus_dependent_peptides.csv';

#a empty hash
my %hash;
my $size = Devel::Size::total_size(\%hash);
printf("Size of an empty hash: %s byte\n", $size);

print "store a hash into RAM\n";
my $pointer = flatfile_to_hash2($infile);
%hash = %$pointer;
my $num = keys %hash;    # number of top-level keys
$size = int(Devel::Size::total_size(\%hash) / 1024 / 1024);
printf("Size of a hash with %s keys: %s MB\n", $num, $size);

print "store a hash into DBM file\n";
my $db = flatfile_to_DBM_hash2($infile);
$num  = keys %$db;       # number of top-level keys
$size = int(Devel::Size::total_size(\%$db) / 1024);
printf("Size of a DBM hash with %s keys: %s KB\n", $num, $size);

#foreach my $a(keys %$db){
#    my $p=$db->{$a};
#    foreach my $b(keys %$p){
#        printf("%s\t%s\t%s\n", $a, $b, $p->{$b});
#    }
#}
print "ok\n";
Thursday, March 31, 2016
Perl: Storage of huge hash (2)
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment