3 my $dataset = $ARGV[0];
5 my $match_to = $ARGV[1];
6 my $match_these = $ARGV[2];
7 my $match_to_score = $ARGV[3];
8 my $match_these_score = $ARGV[4];
10 print "match_to: $match_to match_these: $match_these\n";
19 # create HOL of incumbent{fingerprint}[ids]
21 while (my $line = <FILE>) {
23 my @fields = split(/\t/,$line);
24 my $id = shift @fields;
25 my $fp = join '^', @fields;
26 if (! defined $pines{ $fp }) { $pines{ $fp } = []; }
27 push @{ $pines{ $fp } }, $id;
31 # do the same for incoming
32 open FILE, $match_these;
33 while (my $line = <FILE>) {
35 my @fields = split(/\t/,$line);
36 my $id = shift @fields;
37 my $fp = join '^', @fields;
38 if (! defined $incoming{ $fp }) { $incoming{ $fp } = []; }
39 push @{ $incoming{ $fp } }, $id;
43 # scoring file stuffs, which i have never used
44 foreach my $file ( $match_to_score, $match_from_score ) {
46 while (my $line = <FILE>) {
48 my @fields = split(/\|/,$line);
49 my $id = shift @fields; $id =~ s/\D//g;
50 my $holdings = shift @fields; $holdings =~ s/\D//g;
51 my $subtitle = shift @fields; $subtitle =~ s/^\s+//; $subtitle =~ s/\s+$//;
52 $score{ $id } = [ $holdings, $subtitle ];
58 open RECORD_IDS, ">match.record_ids";
59 foreach my $fp ( keys %incoming ) {
60 # for each incoming fingerprint,
61 if (defined $pines{ $fp }) {
62 # if there is a matching incumbent fingerprint
63 foreach my $id ( @{ $incoming{ $fp } } ) {
64 # print all incoming record ids
65 print RECORD_IDS "$id\n";
66 if ( ! defined $candidate_match{ $id } )
67 # and create a mapping of incoming ids to fingerprints
69 { $candidate_match{ $id } = []; }
70 push @{ $candidate_match{ $id } }, $fp;
77 # scoring section, which i have never used
78 foreach my $id ( keys %candidate_match ) {
80 # if score{id} exists set subtitle to the sc
81 if (defined $score{ $id })
82 { $subtitle = $score{ $id }[1]; }
84 my @fps = @{ $candidate_match{ $id } };
85 my @candidate_pines = ();
87 my $subtitle_matched = 0;
88 my $highest_holdings = 0;
91 foreach my $fp ( @fps ) {
92 foreach my $pines_id ( @{ $pines{ $fp } } ) {
94 if (defined $score{ $pines_id })
95 { $pines_subtitle = $score{ $pines_id }[1]; }
97 if (defined $score{ $pines_id })
98 { $pines_holdings = $score{ $pines_id }[0]; }
99 if ($pines_subtitle eq $subtitle) {
100 if (! $subtitle_matched) {
101 $subtitle_matched = 1;
102 $best_pines_id = $pines_id;
103 $highest_holdings = -1;
106 if ($subtitle_matched) { next; }
108 if ( $pines_holdings > $highest_holdings ) {
109 $highest_holdings = $pines_holdings;
110 $best_pines_id = $pines_id;
115 # this will silently fail, as the filehandle has been closed, but strict
117 print RECORD_IDS "$best_pines_id\n";
119 if (! defined $match{ $best_pines_id } )
120 { $match{ $best_pines_id } = [ $best_pines_id ]; }
121 push @{ $match{ $best_pines_id } }, $id;
126 open GROUPINGS, ">match.groupings";
127 foreach my $k ( keys %match ) {
128 print GROUPINGS join("^",
132 join(",",@{ $match{ $k } }),
133 join(",",@{ $match{ $k } })