new edit check for 2+ chars in subfield code

[migration-tools.git] / compress_fingerprints
diff --git a/compress_fingerprints b/compress_fingerprints

old mode 100644 (file)

new mode 100755 (executable)

index f912e87..4300429
--- a/compress_fingerprints
+++ b/compress_fingerprints
@@ -10,30 +10,43 @@ my $conf  = {}; # configuration hashref
 initialize($conf);
 
 my %fps  = (); # records matching each fingerprint (and the lead)
-my %recs = (); # fingerprints belonging to each record
+my @recs = (); # fingerprints belonging to each record
+my %seen = (); # records we've already seen
+my $lastscore = 0; # previous fingerprint's score
 
-open FP, '<', $ARGV[0];
+open FP, '<', $ARGV[0] or die "Can't open input file: $!\n";
+
+print "Loading and ranking fingerprints\n";
 while (<FP>) {
     my @fields = split "\t", $_;
     my $fp = populate_fingerprint(@fields);
     rank_fingerprint($fp);
 }
+print "Writing matchset to disk\n";
 dump_records();
 
 
 
 sub populate_fingerprint {
     my @fields = @_;
-    my %fp = ();
+    my %fp = (); # zero fingerprint hash each time thru
 
-    # populate fp hash
+    # populate fp hash -- first the simple data
     $fp{compact} = shift @fields;
     $fp{json}    = shift @fields;
     $fp{id}      = shift @fields;
-    $fp{sha1}    = sha1_base64( join('', @fields) );
-
-    # populate records hash
-    $recs{ $fp{id} }{ $fp{sha1} } = { exist => 1 };
+    # then smash everything else together, remove non-Roman characters, and
+    # generate a SHA1 hash to represent it
+    my $stripped = join('', @fields);
+    $stripped   =~ s/[^A-Za-z0-9]//g;
+    $fp{sha1}    = sha1_base64($stripped);
+
+    # make sure file is sorted properly
+    if ($lastscore and ($fp{compact} > $lastscore)) {
+        print "Input file is sorted improperly or unsorted.\n";
+        die "Sort descending (sort -r) and rerun this script.\n";
+    }
+    $lastscore = $fp{compact};
 
     return \%fp;
 }
@@ -44,36 +57,36 @@ sub rank_fingerprint {
 
     my $sha1 = $fp->{sha1};
     my $id   = $fp->{id};
-    unless ($fps{$sha1}) {
-        # haven't seen this fp before. create a new hashref with the current
-        # record as lead
-        $fps{$sha1} = { lead => { id    => $id,
-                                  score => $fp->{compact} },
-                        recs => [ $id ] };
-        $recs{$id}{$sha1}{lead} = 1;
-    } else {
-        # have seen this fp. push record id onto matchlist
-        push @{ $fps{$sha1}{recs} }, $id;
-        # and set this record as lead if it scores higher than current lead
-        if ($fp->{compact} > $fps{$sha1}{lead}{score}) {
-            $recs{ $fps{$sha1}{lead}{id} }{$sha1}{lead} = 0;
-            $recs{ $id }{$sha1}{lead} = 1;
-            $fps{$sha1}{lead}{id}    = $id;
-            $fps{$sha1}{lead}{score} = $fp->{compact};
+
+    # only process records which haven't already been seen
+    unless (defined $seen{$id}) {
+        unless (defined $fps{$sha1}) {
+            # haven't seen this fp before. create a new listref to hold subs
+            # and stow the hash of the fingerprint that we're lead of
+            $fps{$sha1} = [];
+            push @recs, {id => $id, sha1 => $sha1};
+        } else {
+            # have seen this fp. push record id onto matchlist
+            push @{ $fps{$sha1} }, $id;
         }
+        $seen{$id} = 1;
     }
 }
 
 
-sub dump_fingerprints {
+=head2 dump_records
+
+Writes out a 2-column file of lead and subordinate records.
+
+=cut
+
+sub dump_records {
+    my %used = ();
     open OUT, '>', $conf->{output}
       or die "Can't open ", $conf->{output}, "$!\n";
-    for my $id (keys %recs) {
-        for my $sha1 ( keys %{$recs{$id}} ) {
-            next unless $recs{$id}{$sha1}{lead};
-            for my $subid ( @{$fps{$sha1}{recs}} )
-              { print OUT "$id\t$subid\n" }
-        }
+    for my $rec (@recs) {
+        print OUT $rec->{id}, "\t$_\n"
+          for ( @{ $fps{ $rec->{sha1} } } );
     }
 }