# Example: spit_csv.pl marc.xml 999 a 903 a ; ### produces a tab-delimited file, one...
# [migration-tools.git] / fingerprints.pl
#!/usr/bin/perl
# fingerprints.pl -- print tab-delimited "fingerprint" rows for MARCXML records.
#
# Usage: fingerprints.pl <which> <id_tag> <id_subfield> <marcxml_file> [...]
#
#   which        "primary" prints one basic fingerprint row per title variant;
#                any other value prints the "case a" .. "case d" rows instead
#   id_tag       MARC tag of the field holding the record identifier
#   id_subfield  subfield of that field used as the identifier
#
# Fingerprint rows are written to STDOUT.  Progress messages, batch warnings
# and records that cannot be fingerprinted are reported on STDERR.
use strict;
use warnings;

use MARC::Batch;
use MARC::File::XML ( BinaryEncoding => 'utf-8' );
use MARC::Field;
use Unicode::Normalize;

die "Usage: $0 <which> <id_tag> <id_subfield> <marcxml_file> [...]\n"
    if @ARGV < 4;

my $count = 0;    # running total across all input files
my ( $which, $id_tag, $id_subfield, @files ) = @ARGV;

binmode( STDOUT, ':utf8' );
binmode( STDIN,  ':utf8' );

foreach my $file (@files) {

    print STDERR "Processing " . $file . "\n";

    my $batch = MARC::Batch->new( 'XML', $file );
    $batch->strict_off();
    $batch->warnings_off();

    RECORD:
    while ( my $record = $batch->next() ) {

        $count++;

        # Collect (and thereby clear) the warnings raised while reading this
        # record; warnings_off() only stops them from being echoed directly.
        my @warnings = $batch->warnings();

        my $id_field = $record->field($id_tag);
        if ( !$id_field ) {
            print STDERR "ERROR: This record is missing a $id_tag field.\n" . $record->as_formatted() . "\n=====\n";
            next RECORD;
        }
        my $id = $id_field->as_string($id_subfield);
        print STDERR "WARNINGS: Record id " . $id . " : " . join( ":", @warnings ) . " : continuing...\n" if @warnings;

        # Leader positions 6 and 7 are read as record type and bib level.
        my $leader      = $record->leader();
        my $record_type = char_slice( $leader, 6, 1 );
        my $bib_lvl     = char_slice( $leader, 7, 1 );

        # NORMALIZE: a blank record type is treated as 'a'.  This must be a
        # string comparison; a numeric one matches every letter code.
        $record_type = 'a' if defined $record_type && $record_type eq ' ';

        my $field_008 = $record->field('008');
        my $my_008    = $field_008 ? $field_008->as_string() : undef;
        my $date1     = char_slice( $my_008, 7, 4 );

        # The item form is read from offset 29 for these record types
        # (MAP, VIS) and from offset 23 for everything else.
        my $item_form =
            ( defined $record_type && $record_type =~ /[gkroef]/ )
            ? char_slice( $my_008, 29, 1 )
            : char_slice( $my_008, 23, 1 );

        my @titles = title_variants($record);

        # Identifiers: first whitespace-free token of every 020$a, then 024$a.
        my @isbns;
        foreach my $tag ( '020', '024' ) {
            foreach my $f ( $record->field($tag) ) {
                my $value = $f->subfield('a');
                push @isbns, $1 if $value && $value =~ /(\S+)/;
            }
        }

        my $issn    = first_subfield( $record, '022', 'a' );
        my $lccn    = first_subfield( $record, '010', 'a' );
        my $edition = first_subfield( $record, '250', 'a' );

        # Author: the first of 100$a, 110$a, 111$a that has a value.
        my $author;
        foreach my $tag ( '100', '110', '111' ) {
            $author = first_subfield( $record, $tag, 'a' );
            last if $author;
        }

        # Pagination: the first run of digits in 300$a.
        my $desc = first_subfield( $record, '300', 'a' );
        my $pagination;
        if ( defined $desc && $desc =~ /(\d+)/ ) {
            $pagination = $1;
        }

        my $publisher = first_subfield( $record, '260', 'b' );
        my $pubyear   = first_subfield( $record, '260', 'c' );
        if ($pubyear) {
            $pubyear = ( $pubyear =~ /(\d\d\d\d)/ ) ? $1 : '';
        }

        # NORMALIZE: author and publisher are reduced to their leading word.
        $author    = leading_word($author)    if $author;
        $publisher = leading_word($publisher) if $publisher;

        # Every component below is required; report all that are absent.
        my @missing;
        push @missing, "Missing item_form. "    unless $item_form;
        push @missing, "Missing valid date1. "  unless defined $date1 && $date1 =~ /\d\d\d\d/;
        push @missing, "Missing record_type. "  unless $record_type;
        push @missing, "Missing bib_lvl. "      unless $bib_lvl;
        push @missing, "Missing title. "        unless @titles;
        push @missing, "Missing author. "       unless $author;
        push @missing, "Missing publisher. "    unless $publisher;
        push @missing, "Missing pubyear. "      unless $pubyear;
        push @missing, "Missing pagination. "   unless $pagination;

        if (@missing) {
            print STDERR "Record " . $id . " did not make the cut: " . join( '', @missing ) . "\n";
            next RECORD;
        }

        # SPIT OUT FINGERPRINTS FROM THE "MODIFIED LOIS ALGORITHM"
        # If we're not getting good matches, we may want to change this.  The same thing goes for some other fields.
        foreach my $title (@titles) {
            my @base = ( $item_form, $date1, $record_type, $bib_lvl, $title, $author, $publisher, $pubyear, $pagination );

            if ( $which eq "primary" ) {
                print STDOUT join( "\t", $id, @base ) . "\n";
                next;
            }

            # case a : isbn
            foreach my $isbn (@isbns) {
                print STDOUT join( "\t", $id, "case a", @base, $isbn ) . "\n";
            }

            # case b : edition
            print STDOUT join( "\t", $id, "case b", @base, $edition ) . "\n" if $edition;

            # case c : issn
            print STDOUT join( "\t", $id, "case c", @base, $issn ) . "\n" if $issn;

            # case d : lccn
            print STDOUT join( "\t", $id, "case d", @base, $lccn ) . "\n" if $lccn;
        }
    }
    print STDERR "Processed $count records\n";
}

# Return substr($string, $offset, $length), or undef when $string is
# undefined or too short to reach $offset (avoids substr warnings).
sub char_slice {
    my ( $string, $offset, $length ) = @_;
    return undef unless defined $string && length($string) > $offset;
    return substr( $string, $offset, $length );
}

# Return subfield $code of the first $tag field in $record, or undef when
# the field or the subfield is absent.  Always yields a single scalar.
sub first_subfield {
    my ( $record, $tag, $code ) = @_;
    my $field = $record->field($tag);    # scalar context: first match only
    return undef unless $field;
    return scalar $field->subfield($code);
}

# Shared text normalisation: decompose, drop every character in the range
# U+0080..U+FFFF (which removes the decomposed accents), lower-case, and
# strip trailing non-word characters.
sub fold_text {
    my ($text) = @_;
    return '' unless defined $text;
    $text = NFD($text);
    $text =~ s/[\x{80}-\x{ffff}]//g;
    $text = lc($text);
    $text =~ s/\W+$//;
    return $text;
}

# Normalise a title: fold_text() plus removal of leading non-word characters.
sub normalize_title {
    my ($title) = @_;
    $title = fold_text($title);
    $title =~ s/^\W+//;
    return $title;
}

# Normalise an author/publisher string to its leading word.  When the folded
# text does not start with a word character it is returned as folded.
sub leading_word {
    my ($text) = @_;
    $text = fold_text($text);
    return ( $text =~ /^(\w+)/ ) ? $1 : $text;
}

# Build the list of normalised title variants for a record from its 245
# field and the 440$a / 490$a fields.  Returns an empty list when there is
# no 245 field.  Empty and repeated variants are dropped; order is kept.
sub title_variants {
    my ($record) = @_;

    my $my_245 = $record->field('245');
    return () unless $my_245;

    my $prefix = "_magic_prefix_for_special_case_1_";

    my $title_a = $my_245->subfield('a');
    $title_a = '' unless defined $title_a;
    my @raw = ($title_a);

    my $title_b = $my_245->subfield('b');
    if ($title_b) {
        push @raw, $title_a . ', ' . $title_b, $prefix . $title_b;
    }

    # The subfield is read from the 245 field itself, not from a title string.
    my $title_p = $my_245->subfield('p');
    if ($title_p) {
        push @raw, $title_a . ', ' . $title_p;
    }

    foreach my $series_tag ( '440', '490' ) {
        my $series = first_subfield( $record, $series_tag, 'a' );
        next unless $series;
        push @raw, $series . ', ' . $title_a, $prefix . $title_a;
    }

    my %seen;
    return grep { length($_) && !$seen{$_}++ } map { normalize_title($_) } @raw;
}