initialize($conf);
my %fps = (); # records matching each fingerprint (and the lead)
-my %recs = (); # fingerprints belonging to each record
+my @recs = (); # lead records in first-seen order; each entry is { id, sha1 }
+my %seen = (); # record ids already placed in a matchset (as lead or sub)
+my $lastscore = 0; # previous fingerprint's compact score, for the sort-order check
open FP, '<', $ARGV[0] or die "Can't open input file: $!\n";
-my $count = 0;
-my $total = `wc -l $ARGV[0]`;
-
print "Loading and ranking fingerprints\n";
while (<FP>) {
my @fields = split "\t", $_;
my $fp = populate_fingerprint(@fields);
rank_fingerprint($fp);
}
-print "$total fingerprints processed\n";
-print "$count records set as leads\n"'
print "Writing matchset to disk\n";
dump_records();
$stripped =~ s/[^A-Za-z0-9]//g;
$fp{sha1} = sha1_base64($stripped);
- # populate records hash
- $recs{ $fp{id} }{ $fp{sha1} } = {};
+ # make sure file is sorted properly
+ if ($lastscore and ($fp{compact} > $lastscore)) {
+ print "Input file is sorted improperly or unsorted.\n";
+ die "Sort descending (sort -r) and rerun this script.\n";
+ }
+ $lastscore = $fp{compact};
return \%fp;
}
my $sha1 = $fp->{sha1};
my $id = $fp->{id};
- my $islead = $recs{$id}{lead};
-
- # only process records which haven't already been set as a sub
- unless (defined $islead and $islead) {
- unless ($fps{$sha1}) {
- # haven't seen this fp before. create a new hashref with the current
- # record as lead
- $fps{$sha1} = { lead => { id => $id,
- score => $fp->{compact} },
- recs => [ $id ] };
- $recs{$id}{lead} = 1;
- $count++;
+
+ # only process records which haven't already been seen
+ unless (defined $seen{$id}) {
+ unless (defined $fps{$sha1}) {
+ # haven't seen this fp before. create a new listref to hold subs
+ # and stow the hash of the fingerprint that we're lead of
+ $fps{$sha1} = [];
+ push @recs, {id => $id, sha1 => $sha1};
} else {
# have seen this fp. push record id onto matchlist
- push @{ $fps{$sha1}{recs} }, $id;
- if ($fp->{compact} > $fps{$sha1}{lead}{score}) {
- # and set this record as lead if it scores higher than current lead
- $recs{ $fps{$sha1}{lead}{id} }{lead} = 0; # unset current
- $recs{ $id }{lead} = 1; # set new as lead
- $fps{$sha1}{lead}{id} = $id;
- $fps{$sha1}{lead}{score} = $fp->{compact};
- }
+ push @{ $fps{$sha1} }, $id;
}
+ $seen{$id} = 1;
}
}
my %used = ();
open OUT, '>', $conf->{output}
or die "Can't open ", $conf->{output}, "$!\n";
- for my $id (keys %recs) {
- next unless $recs{$id}{lead};
- for my $sha1 ( keys %{$recs{$id}} ) {
- for my $subid ( @{$fps{$sha1}{recs}} ) {
- next if ($id == $subid);
- next if defined $used{$subid};
- $used{$subid} = 1;
- print OUT "$id\t$subid\n";
- }
- }
+ for my $rec (@recs) {
+ print OUT $rec->{id}, "\t$_\n"
+ for ( @{ $fps{ $rec->{sha1} } } );
}
}