From 3ec4fd758b018abda104bbeffa5ebf02b832030b Mon Sep 17 00:00:00 2001 From: Shawn Boyette Date: Thu, 15 Jan 2009 14:41:33 +0000 Subject: [PATCH] renaming compress_fingerprints --- compress_fingerprints | 122 ------------------------------------------------- match_fingerprints | 122 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 122 deletions(-) delete mode 100755 compress_fingerprints create mode 100755 match_fingerprints diff --git a/compress_fingerprints b/compress_fingerprints deleted file mode 100755 index 4300429..0000000 --- a/compress_fingerprints +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; -use open ':utf8'; - -use Digest::SHA1 qw(sha1_base64); -use Getopt::Long; - -my $conf = {}; # configuration hashref -initialize($conf); - -my %fps = (); # records matching each fingerprint (and the lead) -my @recs = (); # fingerprints belonging to each record -my %seen = (); # records we've already seen -my $lastscore = 0; # previous fingerprint's score - -open FP, '<', $ARGV[0] or die "Can't open input file: $!\n"; - -print "Loading and ranking fingerprints\n"; -while () { - my @fields = split "\t", $_; - my $fp = populate_fingerprint(@fields); - rank_fingerprint($fp); -} -print "Writing matchset to disk\n"; -dump_records(); - - - -sub populate_fingerprint { - my @fields = @_; - my %fp = (); # zero fingerprint hash each time thru - - # populate fp hash -- first the simple data - $fp{compact} = shift @fields; - $fp{json} = shift @fields; - $fp{id} = shift @fields; - # then smash everything else together, remove non-Roman characters, and - # generate a SHA1 hash to represent it - my $stripped = join('', @fields); - $stripped =~ s/[^A-Za-z0-9]//g; - $fp{sha1} = sha1_base64($stripped); - - # make sure file is sorted properly - if ($lastscore and ($fp{compact} > $lastscore)) { - print "Input file is sorted improperly or unsorted.\n"; - die "Sort descending (sort -r) and rerun this script.\n"; - } - $lastscore = $fp{compact}; - - return \%fp; -} - - -sub rank_fingerprint { - my ($fp) = @_; - - my $sha1 = $fp->{sha1}; - my $id = $fp->{id}; - - # only process records which haven't already been seen - unless (defined $seen{$id}) { - unless (defined $fps{$sha1}) { - # haven't seen this fp before. create a new listref to hold subs - # and stow the hash of the fingerprint that we're lead of - $fps{$sha1} = []; - push @recs, {id => $id, sha1 => $sha1}; - } else { - # have seen this fp. push record id onto matchlist - push @{ $fps{$sha1} }, $id; - } - $seen{$id} = 1; - } -} - - -=head2 dump_records - -Writes out a 2-column file of lead and subordinate records. - -=cut - -sub dump_records { - my %used = (); - open OUT, '>', $conf->{output} - or die "Can't open ", $conf->{output}, "$!\n"; - for my $rec (@recs) { - print OUT $rec->{id}, "\t$_\n" - for ( @{ $fps{ $rec->{sha1} } } ); - } -} - -sub initialize { - my ($c) = @_; - my @missing = (); - - # set mode on existing filehandles - binmode(STDIN, ':utf8'); - - my $rc = GetOptions( $c, - 'output|o=s', - 'help|h', - ); - show_help() unless $rc; - show_help() if ($c->{help}); - - my @keys = keys %{$c}; - show_help() unless (@ARGV and @keys); - for my $key ('output') - { push @missing, $key unless $c->{$key} } - if (@missing) { - print "Required option: ", join(', ', @missing), " missing!\n"; - show_help(); - } -} - -sub show_help { - print <) { + my @fields = split "\t", $_; + my $fp = populate_fingerprint(@fields); + rank_fingerprint($fp); +} +print "Writing matchset to disk\n"; +dump_records(); + + + +sub populate_fingerprint { + my @fields = @_; + my %fp = (); # zero fingerprint hash each time thru + + # populate fp hash -- first the simple data + $fp{compact} = shift @fields; + $fp{json} = shift @fields; + $fp{id} = shift @fields; + # then smash everything else together, remove non-Roman characters, and + # generate a SHA1 hash to represent it + my $stripped = join('', @fields); + $stripped =~ s/[^A-Za-z0-9]//g; + $fp{sha1} = sha1_base64($stripped); + + # make sure file is sorted properly + if ($lastscore and ($fp{compact} > $lastscore)) { + print "Input file is sorted improperly or unsorted.\n"; + die "Sort descending (sort -r) and rerun this script.\n"; + } + $lastscore = $fp{compact}; + + return \%fp; +} + + +sub rank_fingerprint { + my ($fp) = @_; + + my $sha1 = $fp->{sha1}; + my $id = $fp->{id}; + + # only process records which haven't already been seen + unless (defined $seen{$id}) { + unless (defined $fps{$sha1}) { + # haven't seen this fp before. create a new listref to hold subs + # and stow the hash of the fingerprint that we're lead of + $fps{$sha1} = []; + push @recs, {id => $id, sha1 => $sha1}; + } else { + # have seen this fp. push record id onto matchlist + push @{ $fps{$sha1} }, $id; + } + $seen{$id} = 1; + } +} + + +=head2 dump_records + +Writes out a 2-column file of lead and subordinate records. + +=cut + +sub dump_records { + my %used = (); + open OUT, '>', $conf->{output} + or die "Can't open ", $conf->{output}, "$!\n"; + for my $rec (@recs) { + print OUT $rec->{id}, "\t$_\n" + for ( @{ $fps{ $rec->{sha1} } } ); + } +} + +sub initialize { + my ($c) = @_; + my @missing = (); + + # set mode on existing filehandles + binmode(STDIN, ':utf8'); + + my $rc = GetOptions( $c, + 'output|o=s', + 'help|h', + ); + show_help() unless $rc; + show_help() if ($c->{help}); + + my @keys = keys %{$c}; + show_help() unless (@ARGV and @keys); + for my $key ('output') + { push @missing, $key unless $c->{$key} } + if (@missing) { + print "Required option: ", join(', ', @missing), " missing!\n"; + show_help(); + } +} + +sub show_help { + print <