6 use Digest::SHA1 qw(sha1_base64);
9 my $conf = {}; # configuration hashref
12 my %fps = (); # records matching each fingerprint (and the lead)
13 my @recs = (); # fingerprints belonging to each record
14 my %seen = (); # records we've already seen
15 my $lastscore = 0; # previous fingerprint's score
17 my %leads = (); # error-checking hashes
20 open FP, '<', $ARGV[0] or die "Can't open input file: $!\n";
22 print "Loading and ranking fingerprints\n";
24 my @fields = split "\t", $_;
25 my $fp = populate_fingerprint(@fields);
26 rank_fingerprint($fp);
28 print "Writing matchset to disk\n";
33 sub populate_fingerprint {
35 my %fp = (); # zero fingerprint hash each time thru
37 # populate fp hash -- first the simple data
38 $fp{compact} = shift @fields;
39 $fp{json} = shift @fields;
40 $fp{id} = shift @fields;
41 # then smash everything else together, remove non-Roman characters, and
42 # generate a SHA1 hash to represent it
43 my $stripped = join('', @fields);
44 $stripped =~ s/[^A-Za-z0-9]//g;
45 $fp{sha1} = sha1_base64($stripped);
47 # make sure file is sorted properly
48 if ($lastscore and ($fp{compact} > $lastscore)) {
49 print "Input file is sorted improperly or unsorted.\n";
50 die "Sort descending (sort -r) and rerun this script.\n";
52 $lastscore = $fp{compact};
58 sub rank_fingerprint {
61 my $sha1 = $fp->{sha1};
64 # only process records which haven't already been seen
65 unless (defined $seen{$id}) {
66 unless (defined $fps{$sha1}) {
67 # haven't seen this fp before. create a new listref to hold subs
68 # and stow the hash of the fingerprint that we're lead of
70 push @recs, {id => $id, sha1 => $sha1};
72 # have seen this fp. push record id onto matchlist
73 push @{ $fps{$sha1} }, $id;
82 Writes out a 2-column file of lead and subordinate records.
88 open OUT, '>', $conf->{output}
89 or die "Can't open ", $conf->{output}, "$!\n";
91 for ( @{ $fps{ $rec->{sha1} } } ) {
92 # check for dupes and die if they exist
93 die "Collision: dupe sub record $_\n" if $subs{$_};
95 die "Collision: dupe lead record ", $rec->{id}, "\n"
96 if $leads{ $rec->{id} };
97 $leads{ $rec->{id} } = 1;
98 die "Collision: lead in sub list ", $rec->{id}, "\n"
99 if $subs{ $rec->{id} };
101 # still here? output.
102 print OUT $rec->{id}, "\t$_\n"
111 # set mode on existing filehandles
112 binmode(STDIN, ':utf8');
114 my $rc = GetOptions( $c,
118 show_help() unless $rc;
119 show_help() if ($c->{help});
121 my @keys = keys %{$c};
122 show_help() unless (@ARGV and @keys);
123 for my $key ('output')
124 { push @missing, $key unless $c->{$key} }
126 print "Required option: ", join(', ', @missing), " missing!\n";
133 Usage is: compress_fingerprints -o OUTPUTFILE INPUTFILE