fingerprinter tweak
[migration-tools.git] / match_fingerprints.pl
#!/usr/bin/perl
#
# match_fingerprints.pl
#
# Match the fingerprints of an incoming record set against a reference
# record set (called "pines" in this script) and group every matched
# incoming record under one reference ("lead") record.
#
# Usage:
#   match_fingerprints.pl DATASET MATCH_TO MATCH_THESE \
#                         [MATCH_TO_SCORE [MATCH_THESE_SCORE]]
#
# Inputs:
#   MATCH_TO, MATCH_THESE
#       Tab-separated lines.  The first field is the record id; the
#       remaining fields, joined with '^', form the fingerprint.
#   MATCH_TO_SCORE, MATCH_THESE_SCORE (optional)
#       Pipe-separated lines: id | holdings | subtitle.  Non-digits are
#       stripped from id and holdings; subtitle is whitespace-trimmed.
#
# Outputs (written to the current directory):
#   match.record_ids
#       One id per line: every incoming id whose fingerprint matched,
#       followed by the lead id chosen for each matched incoming record.
#   match.groupings
#       One line per lead id:
#       checking^DATASET^LEAD^LEAD,id,id,...^LEAD,id,id,...

use strict;
use warnings;

my ( $dataset, $match_to, $match_these, $match_to_score, $match_these_score )
    = @ARGV;

if ( @ARGV < 3 ) {
    die "Usage: $0 dataset match_to match_these "
      . "[match_to_score [match_these_score]]\n";
}

print "match_to: $match_to match_these: $match_these\n";

my $pines    = load_fingerprints($match_to);
my $incoming = load_fingerprints($match_these);

# Both score files feed one lookup table keyed by record id.
my $score = load_scores( $match_to_score, $match_these_score );

# The handle stays open until the lead ids have been written as well;
# it must not be closed after the first pass.
open my $record_ids_fh, '>', 'match.record_ids'
    or die "Cannot write match.record_ids: $!\n";

# Pass 1: for every incoming record, collect the fingerprints that also
# occur in the reference set.  Keys are sorted so output is reproducible.
my %candidate_match;
for my $fp ( sort keys %{$incoming} ) {
    next if !exists $pines->{$fp};

    for my $id ( @{ $incoming->{$fp} } ) {
        print {$record_ids_fh} "$id\n";
        push @{ $candidate_match{$id} }, $fp;
    }
}

# Pass 2: pick one lead reference record per matched incoming record and
# collect the incoming ids under it.  Each group starts with the lead id.
my %match;
for my $id ( sort keys %candidate_match ) {
    my $best_pines_id
        = choose_best_match( $id, $candidate_match{$id}, $pines, $score );

    print {$record_ids_fh} "$best_pines_id\n";

    if ( !exists $match{$best_pines_id} ) {
        $match{$best_pines_id} = [$best_pines_id];
    }
    push @{ $match{$best_pines_id} }, $id;
}

close $record_ids_fh
    or die "Error closing match.record_ids: $!\n";

open my $groupings_fh, '>', 'match.groupings'
    or die "Cannot write match.groupings: $!\n";

for my $lead ( sort keys %match ) {
    my $members = join ',', @{ $match{$lead} };

    # The member list is deliberately emitted twice, as in the original
    # output format.
    print {$groupings_fh}
        join( '^', 'checking', $dataset, $match{$lead}[0], $members, $members ),
        "\n";
}

close $groupings_fh
    or die "Error closing match.groupings: $!\n";

# load_fingerprints($path)
#
# Read a tab-separated fingerprint file and return a hash reference that
# maps each fingerprint (fields after the id, joined with '^') to an
# array reference of the record ids carrying it, in file order.
# Lines without an id are skipped.  Dies if the file cannot be opened.
sub load_fingerprints {
    my ($path) = @_;

    my %ids_for;

    open my $fh, '<', $path
        or die "Cannot open fingerprint file '$path': $!\n";

    while ( my $line = <$fh> ) {
        chomp $line;
        my ( $id, @fields ) = split /\t/, $line;
        next if !defined $id || $id eq '';

        push @{ $ids_for{ join '^', @fields } }, $id;
    }

    close $fh;

    return \%ids_for;
}

# load_scores(@paths)
#
# Read zero or more pipe-separated score files and return a hash
# reference mapping record id => [ holdings, subtitle ].  Holdings
# defaults to 0 and subtitle to '' when the field is missing or empty.
# Undefined paths are skipped; a path that cannot be opened produces a
# warning and is skipped, since matching still works without scores.
sub load_scores {
    my (@paths) = @_;

    my %score_for;

    for my $path (@paths) {

        # Three-argument open with an undefined filename would open an
        # anonymous temporary file instead of failing, so guard here.
        next if !defined $path;

        my $fh;
        if ( !open $fh, '<', $path ) {
            warn "Cannot open score file '$path': $!; continuing without it\n";
            next;
        }

        while ( my $line = <$fh> ) {
            chomp $line;
            my ( $id, $holdings, $subtitle ) = split /\|/, $line;
            next if !defined $id;

            $id =~ s/\D//g;
            next if $id eq '';

            $holdings = '' if !defined $holdings;
            $holdings =~ s/\D//g;
            $holdings = 0 if $holdings eq '';

            $subtitle = '' if !defined $subtitle;
            $subtitle =~ s/^\s+//;
            $subtitle =~ s/\s+$//;

            $score_for{$id} = [ $holdings, $subtitle ];
        }

        close $fh;
    }

    return \%score_for;
}

# choose_best_match($id, $fps, $pines, $score)
#
# Choose the lead reference record for incoming record $id.
#
#   $fps    array ref of fingerprints shared with the reference set
#   $pines  hash ref: fingerprint => array ref of reference ids
#   $score  hash ref: id => [ holdings, subtitle ]
#
# Selection rule:
#   * Until a reference record with the same subtitle is seen, the
#     candidate with the highest holdings wins.
#   * From the first subtitle match onwards, only subtitle-matching
#     candidates are considered, again preferring the highest holdings.
#   * Ties keep the earlier candidate.
#
# Records without a score entry count as holdings 0, subtitle ''.
# The holdings threshold starts at -1 so that a candidate with zero
# holdings can still be chosen; a lead id is therefore always returned
# when at least one candidate exists.
sub choose_best_match {
    my ( $id, $fps, $pines, $score ) = @_;

    my $subtitle = exists $score->{$id} ? $score->{$id}[1] : '';

    my $subtitle_matched = 0;
    my $highest_holdings = -1;
    my $best_pines_id;

    for my $fp ( @{$fps} ) {
        for my $pines_id ( @{ $pines->{$fp} } ) {
            my ( $pines_holdings, $pines_subtitle )
                = exists $score->{$pines_id}
                ? @{ $score->{$pines_id} }
                : ( 0, '' );

            if ( $pines_subtitle eq $subtitle ) {
                if ( !$subtitle_matched ) {

                    # First subtitle match: discard whatever was chosen
                    # on holdings alone and restart the comparison.
                    $subtitle_matched = 1;
                    $best_pines_id    = $pines_id;
                    $highest_holdings = -1;
                }
            }
            elsif ($subtitle_matched) {
                next;
            }

            if ( $pines_holdings > $highest_holdings ) {
                $highest_holdings = $pines_holdings;
                $best_pines_id    = $pines_id;
            }
        }
    }

    return $best_pines_id;
}
119