#!/usr/bin/perl
+
+# Copyright 2009-2012, Equinox Software, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+
use strict;
use warnings;
use open ':utf8';
# File-scoped bookkeeping variables.
my %seen = (); # records we've already seen
my $lastscore = 0; # previous fingerprint's score
-my %leads = (); # error-checking hashes
-my %subs = ();
+my %subs = (); # error-checking hash of sub records already seen
# The input file is the first command-line argument; abort if it cannot be opened.
open FP, '<', $ARGV[0] or die "Can't open input file: $!\n";
# check for dupes and die if they exist
die "Collision: dupe sub record $_\n" if $subs{$_};
$subs{$_} = 1;
- die "Collision: dupe lead record ", $rec->{id}, "\n"
- if $leads{ $rec->{id} };
- $leads{ $rec->{id} } = 1;
die "Collision: lead in sub list ", $rec->{id}, "\n"
if $subs{ $rec->{id} };
+ # we don't want subs below threshold
+ next if ($_ < $conf->{threshold});
+
# still here? output.
print OUT $rec->{id}, "\t$_\n"
}
my $rc = GetOptions( $c,
'output|o=s',
+ 'threshold|t=i',
'help|h',
);
show_help() unless $rc;
print "Required option: ", join(', ', @missing), " missing!\n";
show_help();
}
+
+ $c->{threshold} = 0 unless $c->{threshold};
}
# show_help: print the usage text to the currently selected output handle
# and terminate the program. Takes no arguments and never returns.
sub show_help {
print <<HELP;
-Usage is: compress_fingerprints -o OUTPUTFILE INPUTFILE
+Usage is: compress_fingerprints [-t THRESHOLD] -o OUTPUTFILE INPUTFILE
HELP
# bare exit: the process ends with status 0
exit;
}