X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=match_fingerprints;fp=match_fingerprints;h=4300429df383a55416d9cbd59479b92aab83c387;hp=0000000000000000000000000000000000000000;hb=3ec4fd758b018abda104bbeffa5ebf02b832030b;hpb=07e09b509ccddb459137a98759781b6f8dcb396b diff --git a/match_fingerprints b/match_fingerprints new file mode 100755 index 0000000..4300429 --- /dev/null +++ b/match_fingerprints @@ -0,0 +1,122 @@ +#!/usr/bin/perl +use strict; +use warnings; +use open ':utf8'; + +use Digest::SHA1 qw(sha1_base64); +use Getopt::Long; + +my $conf = {}; # configuration hashref +initialize($conf); + +my %fps = (); # records matching each fingerprint (and the lead) +my @recs = (); # fingerprints belonging to each record +my %seen = (); # records we've already seen +my $lastscore = 0; # previous fingerprint's score + +open FP, '<', $ARGV[0] or die "Can't open input file: $!\n"; + +print "Loading and ranking fingerprints\n"; +while () { + my @fields = split "\t", $_; + my $fp = populate_fingerprint(@fields); + rank_fingerprint($fp); +} +print "Writing matchset to disk\n"; +dump_records(); + + + +sub populate_fingerprint { + my @fields = @_; + my %fp = (); # zero fingerprint hash each time thru + + # populate fp hash -- first the simple data + $fp{compact} = shift @fields; + $fp{json} = shift @fields; + $fp{id} = shift @fields; + # then smash everything else together, remove non-Roman characters, and + # generate a SHA1 hash to represent it + my $stripped = join('', @fields); + $stripped =~ s/[^A-Za-z0-9]//g; + $fp{sha1} = sha1_base64($stripped); + + # make sure file is sorted properly + if ($lastscore and ($fp{compact} > $lastscore)) { + print "Input file is sorted improperly or unsorted.\n"; + die "Sort descending (sort -r) and rerun this script.\n"; + } + $lastscore = $fp{compact}; + + return \%fp; +} + + +sub rank_fingerprint { + my ($fp) = @_; + + my $sha1 = $fp->{sha1}; + my $id = $fp->{id}; + + # only process records which haven't already been seen + unless (defined $seen{$id}) { + unless (defined $fps{$sha1}) { + # haven't seen this fp before. create a new listref to hold subs + # and stow the hash of the fingerprint that we're lead of + $fps{$sha1} = []; + push @recs, {id => $id, sha1 => $sha1}; + } else { + # have seen this fp. push record id onto matchlist + push @{ $fps{$sha1} }, $id; + } + $seen{$id} = 1; + } +} + + +=head2 dump_records + +Writes out a 2-column file of lead and subordinate records. + +=cut + +sub dump_records { + my %used = (); + open OUT, '>', $conf->{output} + or die "Can't open ", $conf->{output}, "$!\n"; + for my $rec (@recs) { + print OUT $rec->{id}, "\t$_\n" + for ( @{ $fps{ $rec->{sha1} } } ); + } +} + +sub initialize { + my ($c) = @_; + my @missing = (); + + # set mode on existing filehandles + binmode(STDIN, ':utf8'); + + my $rc = GetOptions( $c, + 'output|o=s', + 'help|h', + ); + show_help() unless $rc; + show_help() if ($c->{help}); + + my @keys = keys %{$c}; + show_help() unless (@ARGV and @keys); + for my $key ('output') + { push @missing, $key unless $c->{$key} } + if (@missing) { + print "Required option: ", join(', ', @missing), " missing!\n"; + show_help(); + } +} + +sub show_help { + print <