6 use Digest::SHA1 qw(sha1_base64);
9 my $conf = {}; # configuration hashref
12 my %fps = (); # records matching each fingerprint (and the lead)
13 my %recs = (); # fingerprints belonging to each record
15 open FP, '<', $ARGV[0];
17 my @fields = split "\t", $_;
18 my $fp = populate_fingerprint(@fields);
19 rank_fingerprint($fp);
25 sub populate_fingerprint {
30 $fp{compact} = shift @fields;
31 $fp{json} = shift @fields;
32 $fp{id} = shift @fields;
33 $fp{sha1} = sha1_base64( join('', @fields) );
35 # populate records hash
36 $recs{ $fp{id} }{ $fp{sha1} } = { exist => 1 };
42 sub rank_fingerprint {
45 my $sha1 = $fp->{sha1};
47 unless ($fps{$sha1}) {
48 # haven't seen this fp before. create a new hashref with the current
50 $fps{$sha1} = { lead => { id => $id,
51 score => $fp->{compact} },
53 $recs{$id}{$sha1}{lead} = 1;
55 # have seen this fp. push record id onto matchlist
56 push @{ $fps{$sha1}{recs} }, $id;
57 # and set this record as lead if it scores higher than current lead
58 if ($fp->{compact} > $fps{$sha1}{lead}{score}) {
59 $recs{ $fps{$sha1}{lead}{id} }{$sha1}{lead} = 0;
60 $recs{ $id }{$sha1}{lead} = 1;
61 $fps{$sha1}{lead}{id} = $id;
62 $fps{$sha1}{lead}{score} = $fp->{compact};
68 sub dump_fingerprints {
69 open OUT, '>', $conf->{output}
70 or die "Can't open ", $conf->{output}, "$!\n";
71 for my $id (keys %recs) {
72 for my $sha1 ( keys %{$recs{$id}} ) {
73 next unless $recs{$id}{$sha1}{lead};
74 for my $subid ( @{$fps{$sha1}{recs}} )
75 { print OUT "$id\t$subid\n" }
84 # set mode on existing filehandles
85 binmode(STDIN, ':utf8');
87 my $rc = GetOptions( $c,
91 show_help() unless $rc;
92 show_help() if ($c->{help});
94 my @keys = keys %{$c};
95 show_help() unless (@ARGV and @keys);
96 for my $key ('output')
97 { push @missing, $key unless $c->{$key} }
99 print "Required option: ", join(', ', @missing), " missing!\n";
106 Usage is: compress_fingerprints -o OUTPUTFILE INPUTFILE