From: Shawn Boyette Date: Mon, 8 Dec 2008 22:52:19 +0000 (+0000) Subject: compress: now detects input sorted improperly X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=commitdiff_plain;h=c25291487fffdb20f740010b48e93fa2c58dcac0 compress: now detects input sorted improperly --- diff --git a/compress_fingerprints b/compress_fingerprints index 4c815fc..a436b75 100755 --- a/compress_fingerprints +++ b/compress_fingerprints @@ -11,6 +11,7 @@ initialize($conf); my %fps = (); # records matching each fingerprint (and the lead) my %recs = (); # fingerprints belonging to each record +my $lastscore = 0; # previous fingerprint's score open FP, '<', $ARGV[0] or die "Can't open input file: $!\n"; @@ -40,6 +41,13 @@ sub populate_fingerprint { $stripped =~ s/[^A-Za-z0-9]//g; $fp{sha1} = sha1_base64($stripped); + # make sure file is sorted properly + if ($lastscore and $fp{compact} > $lastscore) { + print "Input file is sorted improperly or unsorted.\n"; + die "Sort descending (sort -ru) and rerun this script.\n"; + } + $lastscore = $fp{compact}; + # populate records hash $recs{ $fp{id} }{ $fp{sha1} } = {}; @@ -80,7 +88,9 @@ sub rank_fingerprint { =head2 dump_records -Writes out a 2-column file of lead and subordinate records. +Writes out a 2-column file of lead and subordinate records. If +posttest is enabled, a scan is also done to ensure that no recordid +appears as both a subordinate and lead. =cut