#!/usr/bin/perl
+
+# Copyright 2009-2012, Equinox Software, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
use strict;
use warnings;
use open ':utf8';
$fp{sha1} = sha1_base64($stripped);
# make sure file is sorted properly
- if ($lastscore and ($fp{compact} > $lastscore)) {
+ if ($lastscore and ($fp{compact} < $lastscore)) {
print "Input file is sorted improperly or unsorted.\n";
die "Sort descending (sort -r) and rerun this script.\n";
}
my $id = $fp->{id};
# only process records which haven't already been seen
- unless (defined $seen{$id}) {
+ #unless (defined $seen{$id}) {
unless (defined $fps{$sha1}) {
# haven't seen this fp before. create a new listref to hold subs
# and stow the hash of the fingerprint that we're lead of
push @{ $fps{$sha1} }, $id;
}
$seen{$id} = 1;
- }
+ #}
}
for my $rec (@recs) {
for ( @{ $fps{ $rec->{sha1} } } ) {
# check for dupes and die if they exist
- die "Collision: dupe sub record $_\n" if $subs{$_};
- $subs{$_} = 1;
- die "Collision: lead in sub list ", $rec->{id}, "\n"
- if $subs{ $rec->{id} };
-
- # we don't want subs below threshold
- next if ($_ < $conf->{threshold});
-
- # still here? output.
- print OUT $rec->{id}, "\t$_\n"
+ #die "Collision: dupe sub record $_\n" if $subs{$_};
+ if( not exists ($subs{$_}) ) {
+ #die "Collision: lead in sub list ", $rec->{id}, "\n"
+ # if $subs{ $rec->{id} };
+ my $lead = $rec->{id};
+
+ # we don't need to match onto itself
+ next if ($lead eq $_);
+
+ if ($subs{ $rec->{id} })
+ {
+ print "moving ".$lead." to ".$subs{$rec->{id}}."\n";
+ $lead = $subs{$rec->{id}};
+ }
+
+ if($_ eq '1459051')
+ {
+ print "looking at ".$rec->{sha1};
+ }
+
+ # we don't want subs below threshold
+ next if ($_ < $conf->{threshold});
+
+ # we don't need to match onto itself
+ next if ($lead eq $_);
+
+ # record this sub having this leader
+ $subs{$_} = $lead;
+
+ # still here? output.
+ print OUT $lead, "\t$_\n"
+ }
}
}
}