8 use MARC::File::XML ( BinaryEncoding => 'utf-8' );
10 use Unicode::Normalize;
12 my $conf = {}; # configuration hashref
13 my $marc = {}; # MARC record hashref
17 for my $file (@ARGV) {
19 print STDERR "Processing $file\n";
21 open my $M, '<:utf8', $file;
23 my $batch = MARC::Batch->new('XML',$M);
25 $batch->warnings_off();
27 while ( my $record = $batch->next ) {
30 my $id = $record->field($conf->{tag});
32 print STDERR "ERROR: This record is missing a", $conf->{tag},
33 "field.\n", $record->as_formatted(), "\n=====\n";
36 $id = $id->as_string($conf->{subfield});
38 my $leader = $record->leader();
39 my $record_type = substr($leader,6,1);
40 my $bib_lvl = substr($leader,7,1);
42 my $my_008 = $record->field('008');
43 $my_008 = $my_008->as_string() if ($my_008);
44 my $date1 = substr($my_008,7,4) if ($my_008);
45 my $date2 = substr($my_008,11,4) if ($my_008);
48 if ( $record_type =~ /[gkroef]/ ) { # MAP, VIS
49 $item_form = substr($my_008,29,1) if ($my_008);
51 $item_form = substr($my_008,23,1) if ($my_008);
54 my $title = $record->field('245');
55 $title = $title->subfield('a') if $title;
59 if ($record->field('020')) { @isbns_020 = $record->field('020'); }
60 foreach my $f ( @isbns_020 ) {
61 if ($f->subfield('a')) {
62 if ( $f->subfield('a')=~/(\S+)/ ) {
68 if ($record->field('024')) { @isbns_024 = $record->field('024'); }
69 foreach my $f ( @isbns_024 ) {
70 if ($f->subfield('a')) {
71 if ( $f->subfield('a')=~/(\S+)/ ) {
77 my $issn = $record->field('022');
78 $issn = $issn->subfield('a') if $issn;
80 my $lccn = $record->field('010');
81 $lccn = $lccn->subfield('a') if $lccn;
84 if ($record->field('100'))
85 { $author = $record->field('100')->subfield('a'); }
87 $author = $record->field('110')->subfield('a')
88 if ($record->field('110'));
89 $author = $record->field('111')->subfield('a')
90 if ($record->field('111'));
93 my $desc = $record->field('300');
94 $desc = $desc->subfield('a') if $desc;
97 $pages = $1 if (defined $desc and $desc =~ /(\d+)/);
99 my $my_260 = $record->field('260');
100 my $publisher = $my_260->subfield('b') if $my_260;
101 my $pubyear = $my_260->subfield('c') if $my_260;
103 if ( $pubyear =~ /(\d\d\d\d)/ )
109 my $edition = $record->field('250');
110 $edition = $edition->subfield('a') if $edition;
113 $record_type = 'a' if ($record_type eq ' ');
115 $title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go;
117 $title =~ s/\W+$//go;
120 $author = NFD($author); $author =~ s/[\x{80}-\x{ffff}]//go;
121 $author = lc($author);
122 $author =~ s/\W+$//go;
123 if ($author =~ /^(\w+)/) {
128 $publisher = NFD($publisher); $publisher =~ s/[\x{80}-\x{ffff}]//go;
129 $publisher = lc($publisher);
130 $publisher =~ s/\W+$//go;
131 if ($publisher =~ /^(\w+)/) {
136 # SPIT OUT FINGERPRINTS FROM THE "LOIS ALGORITHM"
137 # If we're not getting good matches, we may want to change this.
138 # The same thing goes for some other fields.
139 if ($item_form && ($date1 =~ /\d\d\d\d/)
140 && $record_type && $bib_lvl && $title) {
141 if ($conf->{runtype} eq "primary") {
143 join("\t",$id,$item_form,$date1,$record_type,$bib_lvl,$title)
146 # case a : isbn and pages
147 if (scalar(@isbns)>0 && $pages) {
148 foreach my $isbn ( @isbns ) {
150 join("\t", $id, "case a", $item_form, $date1,
151 $record_type, $bib_lvl, $title, $isbn, $pages)
158 join("\t", $id, "case b", $item_form, $date1,
159 $record_type, $bib_lvl, $title,$edition), "\n";
163 print STDOUT join("\t", $id, "case c", $item_form, $date1,
164 $record_type, $bib_lvl, $title, $issn)
169 print STDOUT join("\t", $id, "case d", $item_form, $date1,
170 $record_type, $bib_lvl, $title, $lccn)
173 # case e : author, publisher, pubyear, pages
174 if ($author && $publisher && $pubyear && $pages) {
175 print STDOUT join("\t", $id, "case e", $item_form, $date1,
176 $record_type, $bib_lvl, $title, $author,
177 $publisher, $pubyear, $pages), "\n";
181 print STDERR "Record " . $id . " did not make the cut: ";
182 print STDERR "Missing item_form. " unless ($item_form);
183 print STDERR "Missing valid date1. " unless ($date1 =~ /\d\d\d\d/);
184 print STDERR "Missing record_type. " unless ($record_type);
185 print STDERR "Missing bib_lvl. " unless ($bib_lvl);
186 print STDERR "Missing title. " unless ($title);
190 print STDERR "Processed $count records\n";
195 Performs boring script initialization. Handles argument parsing,
204 # set mode on existing filehandles
205 binmode(STDOUT, ':utf8');
206 binmode(STDIN, ':utf8');
208 my $rc = GetOptions( $c,
213 show_help() unless $rc;
214 my @keys = keys %{$c};
215 show_help() unless (@ARGV and @keys);
217 for my $key ('runtype', 'tag', 'subfield') {
218 push @missing, $key unless $c->{$key}
221 print "Required option: ", join(', ', @missing), " missing!\n";
232 Usage is: fingerprinter [REQUIRED ARGS] [OPTIONS] <filelist>
234 --runtype=(primary|full) -r Do 'primary' or 'full' fingerprinting
235 --tag=N -t Which tag to use
236 --subfield=X -s Which subfield to use