From: Shawn Boyette Date: Wed, 10 Dec 2008 22:39:16 +0000 (+0000) Subject: compress: huge overhaul, taking advantage of the reverse-sort input requirement X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=commitdiff_plain;h=824ba12f0e3886c1329b4b90d70d95db22a15594 compress: huge overhaul, taking advantage of the reverse-sort input requirement --- diff --git a/compress_fingerprints b/compress_fingerprints index 47f5858..a44264f 100755 --- a/compress_fingerprints +++ b/compress_fingerprints @@ -10,7 +10,8 @@ my $conf = {}; # configuration hashref initialize($conf); my %fps = (); # records matching each fingerprint (and the lead) -my %recs = (); # fingerprints belonging to each record +my @recs = (); # fingerprints belonging to each record +my %seen = (); # records we've already seen my $lastscore = 0; # previous fingerprint's score open FP, '<', $ARGV[0] or die "Can't open input file: $!\n"; @@ -47,9 +48,6 @@ sub populate_fingerprint { } $lastscore = $fp{compact}; - # populate records hash - $recs{ $fp{id} }{ $fp{sha1} } = {}; - return \%fp; } @@ -61,28 +59,17 @@ sub rank_fingerprint { my $id = $fp->{id}; # only process records which haven't already been seen - unless (defined $recs{$id}{lead}) { - unless ($fps{$sha1}) { - # haven't seen this fp before. create a new hashref with the current - # record as lead - $fps{$sha1} = { lead => { id => $id, - score => $fp->{compact} }, - recs => [ $id ] }; - $recs{$id}{lead} = 1; + unless (defined $seen{$id}) { + unless (defined $fps{$sha1}) { + # haven't seen this fp before. create a new listref to hold subs + # and stow the hash of the fingerprint that we're lead of + $fps{$sha1} = []; + push @recs, {id => $id, sha1 => $sha1}; } else { # have seen this fp. push record id onto matchlist - push @{ $fps{$sha1}{recs} }, $id; - if ($fp->{compact} > $fps{$sha1}{lead}{score}) { - # and set this record as lead if it scores higher than current lead - $recs{ $fps{$sha1}{lead}{id} }{lead} = 0; # unset current - $recs{ $id }{lead} = 1; # set new as lead - $fps{$sha1}{lead}{id} = $id; - $fps{$sha1}{lead}{score} = $fp->{compact}; - } else { - # otherwise, mark it as a sub so it never gets processed again - $recs{$id}{lead} = 0; - } + push @{ $fps{$sha1} }, $id; } + $seen{$id} = 1; } } @@ -97,17 +84,10 @@ sub dump_records { my %used = (); open OUT, '>', $conf->{output} or die "Can't open ", $conf->{output}, "$!\n"; - for my $id (keys %recs) { - next unless $recs{$id}{lead}; - $used{$id} = 1; - for my $sha1 ( keys %{$recs{$id}} ) { - for my $subid ( @{$fps{$sha1}{recs}} ) { - next if ($id == $subid); - next if defined $used{$subid}; - $used{$subid} = 1; - print OUT "$id\t$subid\n"; - } - } + for my $rec (@recs) { + print $rec,"\n"; + print OUT $rec->{id}, "\t$_\n" + for ( @{ $fps{ $rec->{sha1} } } ); } }