6 use Digest::SHA1 qw(sha1_base64);
9 my $conf = {}; # configuration hashref
12 my %fps = (); # records matching each fingerprint (and the lead)
13 my %recs = (); # fingerprints belonging to each record
14 my $lastscore = 0; # previous fingerprint's score
16 open FP, '<', $ARGV[0] or die "Can't open input file: $!\n";
18 print "Loading and ranking fingerprints\n";
20 my @fields = split "\t", $_;
21 my $fp = populate_fingerprint(@fields);
22 rank_fingerprint($fp);
23 print "\r", ( int($i / $total) ), "% complete" unless ($i % 1000);
25 print "Writing matchset to disk\n";
30 sub populate_fingerprint {
32 my %fp = (); # zero fingerprint hash each time thru
34 # populate fp hash -- first the simple data
35 $fp{compact} = shift @fields;
36 $fp{json} = shift @fields;
37 $fp{id} = shift @fields;
38 # then smash everything else together, remove non-Roman characters, and
39 # generate a SHA1 hash to represent it
40 my $stripped = join('', @fields);
41 $stripped =~ s/[^A-Za-z0-9]//g;
42 $fp{sha1} = sha1_base64($stripped);
44 # make sure file is sorted properly
45 if ($lastscore and $fp{compact} > $lastscore) {
46 print "Input file is sorted improperly or unsorted.\n";
47 die "Sort descending (sort -ru) and rerun this script.\n";
49 $lastscore = $fp{compact};
51 # populate records hash
52 $recs{ $fp{id} }{ $fp{sha1} } = {};
58 sub rank_fingerprint {
61 my $sha1 = $fp->{sha1};
63 my $islead = $recs{$id}{lead};
65 # only process records which haven't already been set as a sub
66 unless (defined $islead and $islead) {
67 unless ($fps{$sha1}) {
68 # haven't seen this fp before. create a new hashref with the current
70 $fps{$sha1} = { lead => { id => $id,
71 score => $fp->{compact} },
75 # have seen this fp. push record id onto matchlist
76 push @{ $fps{$sha1}{recs} }, $id;
77 if ($fp->{compact} > $fps{$sha1}{lead}{score}) {
78 # and set this record as lead if it scores higher than current lead
79 $recs{ $fps{$sha1}{lead}{id} }{lead} = 0; # unset current
80 $recs{ $id }{lead} = 1; # set new as lead
81 $fps{$sha1}{lead}{id} = $id;
82 $fps{$sha1}{lead}{score} = $fp->{compact};
91 Writes out a 2-column file of lead and subordinate records. If
92 posttest is enabled, a scan is also done to ensure that no recordid
93 appears as both a subordinate and lead.
99 open OUT, '>', $conf->{output}
100 or die "Can't open ", $conf->{output}, "$!\n";
101 for my $id (keys %recs) {
102 next unless $recs{$id}{lead};
103 for my $sha1 ( keys %{$recs{$id}} ) {
104 for my $subid ( @{$fps{$sha1}{recs}} ) {
105 next if ($id == $subid);
106 next if defined $used{$subid};
108 print OUT "$id\t$subid\n";
118 # set mode on existing filehandles
119 binmode(STDIN, ':utf8');
121 my $rc = GetOptions( $c,
125 show_help() unless $rc;
126 show_help() if ($c->{help});
128 my @keys = keys %{$c};
129 show_help() unless (@ARGV and @keys);
130 for my $key ('output')
131 { push @missing, $key unless $c->{$key} }
133 print "Required option: ", join(', ', @missing), " missing!\n";
140 Usage is: compress_fingerprints -o OUTPUTFILE INPUTFILE