#!/usr/bin/perl use strict; use warnings; use open ':utf8'; use Digest::SHA1 qw(sha1_base64); use Getopt::Long; my $conf = {}; # configuration hashref initialize($conf); my %fps = (); # records matching each fingerprint (and the lead) my %recs = (); # fingerprints belonging to each record open FP, '<', $ARGV[0]; while () { my @fields = split "\t", $_; my $fp = populate_fingerprint(@fields); rank_fingerprint($fp); } dump_records(); sub populate_fingerprint { my @fields = @_; my %fp = (); # populate fp hash $fp{compact} = shift @fields; $fp{json} = shift @fields; $fp{id} = shift @fields; my $stripped = join('', @fields); $stripped =~ s/[^A-Za-z0-9]//g; $fp{sha1} = sha1_base64($stripped); # populate records hash $recs{ $fp{id} }{ $fp{sha1} } = {}; return \%fp; } sub rank_fingerprint { my ($fp) = @_; my $sha1 = $fp->{sha1}; my $id = $fp->{id}; my $islead = $recs{$id}{lead}; unless (defined $islead and $islead) { # only process records which haven't already been set as a sub unless ($fps{$sha1}) { # haven't seen this fp before. create a new hashref with the current # record as lead $fps{$sha1} = { lead => { id => $id, score => $fp->{compact} }, recs => [ $id ] }; $recs{$id}{$sha1}{lead} = 1; } else { # have seen this fp. push record id onto matchlist push @{ $fps{$sha1}{recs} }, $id; # and set this record as lead if it scores higher than current lead if ($fp->{compact} > $fps{$sha1}{lead}{score}) { $recs{ $fps{$sha1}{lead}{id} }{lead} = 0; $recs{ $id }{lead} = 1; $fps{$sha1}{lead}{id} = $id; $fps{$sha1}{lead}{score} = $fp->{compact}; } } } } sub dump_records { my %used = (); open OUT, '>', $conf->{output} or die "Can't open ", $conf->{output}, "$!\n"; for my $id (keys %recs) { next unless $recs{$id}{lead}; for my $sha1 ( keys %{$recs{$id}} ) { for my $subid ( @{$fps{$sha1}{recs}} ) { next if ($id == $subid); next if defined $used{$subid}; $used{$subid} = 1; print OUT "$id\t$subid\n"; } } } } sub initialize { my ($c) = @_; my @missing = (); # set mode on existing filehandles binmode(STDIN, ':utf8'); my $rc = GetOptions( $c, 'output|o=s', 'help|h', ); show_help() unless $rc; show_help() if ($c->{help}); my @keys = keys %{$c}; show_help() unless (@ARGV and @keys); for my $key ('output') { push @missing, $key unless $c->{$key} } if (@missing) { print "Required option: ", join(', ', @missing), " missing!\n"; show_help(); } } sub show_help { print <