my %seen = (); # records we've already seen
my $lastscore = 0; # previous fingerprint's score
+my %subs = (); # error-checking hash
+
open FP, '<', $ARGV[0] or die "Can't open input file: $!\n";
print "Loading and ranking fingerprints\n";
open OUT, '>', $conf->{output}
or die "Can't open ", $conf->{output}, "$!\n";
for my $rec (@recs) {
- print OUT $rec->{id}, "\t$_\n"
- for ( @{ $fps{ $rec->{sha1} } } );
+ for ( @{ $fps{ $rec->{sha1} } } ) {
+ # check for dupes and die if they exist
+ die "Collision: dupe sub record $_\n" if $subs{$_};
+ $subs{$_} = 1;
+ die "Collision: lead in sub list ", $rec->{id}, "\n"
+ if $subs{ $rec->{id} };
+
+ # we don't want subs below threshold
+ next if ($_ < $conf->{threshold});
+
+ # still here? output.
+ print OUT $rec->{id}, "\t$_\n"
+ }
}
}
my $rc = GetOptions( $c,
'output|o=s',
+ 'threshold|t=i',
'help|h',
);
show_help() unless $rc;
print "Required option: ", join(', ', @missing), " missing!\n";
show_help();
}
+
+ $c->{threshold} = 0 unless $c->{threshold};
}
sub show_help {
print <<HELP;
-Usage is: compress_fingerprints -o OUTPUTFILE INPUTFILE
+Usage is: compress_fingerprints [-t THRESHOLD] -o OUTPUTFILE INPUTFILE
HELP
exit;
}