3 # Copyright 2009-2012, Equinox Software, Inc.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
24 use Digest::SHA1 qw(sha1_base64);
27 my $conf = {}; # configuration hashref
30 my %fps = (); # records matching each fingerprint (and the lead)
31 my @recs = (); # fingerprints belonging to each record
32 my %seen = (); # records we've already seen
33 my $lastscore = 0; # previous fingerprint's score
35 my %subs = (); # error-checking hashe
37 open FP, '<', $ARGV[0] or die "Can't open input file: $!\n";
39 print "Loading and ranking fingerprints\n";
41 my @fields = split "\t", $_;
42 my $fp = populate_fingerprint(@fields);
43 rank_fingerprint($fp);
45 print "Writing matchset to disk\n";
50 sub populate_fingerprint {
52 my %fp = (); # zero fingerprint hash each time thru
54 # populate fp hash -- first the simple data
55 $fp{compact} = shift @fields;
56 $fp{json} = shift @fields;
57 $fp{id} = shift @fields;
58 # then smash everything else together, remove non-Roman characters, and
59 # generate a SHA1 hash to represent it
60 my $stripped = join('', @fields);
61 $stripped =~ s/[^A-Za-z0-9]//g;
62 $fp{sha1} = sha1_base64($stripped);
64 # make sure file is sorted properly
65 if ($lastscore and ($fp{compact} < $lastscore)) {
66 print "Input file is sorted improperly or unsorted.\n";
67 die "Sort descending (sort -r) and rerun this script.\n";
69 $lastscore = $fp{compact};
75 sub rank_fingerprint {
78 my $sha1 = $fp->{sha1};
81 # only process records which haven't already been seen
82 #unless (defined $seen{$id}) {
83 unless (defined $fps{$sha1}) {
84 # haven't seen this fp before. create a new listref to hold subs
85 # and stow the hash of the fingerprint that we're lead of
87 push @recs, {id => $id, sha1 => $sha1};
89 # have seen this fp. push record id onto matchlist
90 push @{ $fps{$sha1} }, $id;
99 Writes out a 2-column file of lead and subordinate records.
105 open OUT, '>', $conf->{output}
106 or die "Can't open ", $conf->{output}, "$!\n";
107 for my $rec (@recs) {
108 for ( @{ $fps{ $rec->{sha1} } } ) {
109 # check for dupes and die if they exist
110 #die "Collision: dupe sub record $_\n" if $subs{$_};
111 if( not exists ($subs{$_}) ) {
112 #die "Collision: lead in sub list ", $rec->{id}, "\n"
113 # if $subs{ $rec->{id} };
114 my $lead = $rec->{id};
116 # we don't need to match onto itself
117 next if ($lead eq $_);
119 if ($subs{ $rec->{id} })
121 print "moving ".$lead." to ".$subs{$rec->{id}}."\n";
122 $lead = $subs{$rec->{id}};
127 print "looking at ".$rec->{sha1};
130 # we don't want subs below threshold
131 next if ($_ < $conf->{threshold});
133 # we don't need to match onto itself
134 next if ($lead eq $_);
136 # record this sub having this leader
139 # still here? output.
140 print OUT $lead, "\t$_\n"
150 # set mode on existing filehandles
151 binmode(STDIN, ':utf8');
153 my $rc = GetOptions( $c,
158 show_help() unless $rc;
159 show_help() if ($c->{help});
161 my @keys = keys %{$c};
162 show_help() unless (@ARGV and @keys);
163 for my $key ('output')
164 { push @missing, $key unless $c->{$key} }
166 print "Required option: ", join(', ', @missing), " missing!\n";
170 $c->{threshold} = 0 unless $c->{threshold};
175 Usage is: compress_fingerprints [-t THRESHOLD] -o OUTPUTFILE INPUTFILE