my $conf = {}; # configuration hashref
my $count = 0;
+my $start = time;
$| = 1;
-initialyze($conf);
+initialize($conf);
open OF, '>', $conf->{output};
binmode(OF, ':utf8');
for my $file (@ARGV) {
print XF "Processing $file\n";
open my $records, '<:utf8', $file;
+ my $batch = undef; my $record = undef;
- my $batch = MARC::Batch->new('XML', $records);
+ $batch = MARC::Batch->new('XML', $records);
$batch->strict_off();
$batch->warnings_off();
- while ( my $record = $batch->next ) {
+ while ( eval { $record = $batch->next } ) {
+ my $marc = undef;
$count++; progress_ticker();
+ unless ( defined $record )
+ { dump_exception($marc); next; }
my $id = $record->field($conf->{tag});
unless ($id) {
- print XF "ERROR: Record $count in $file is missing a",
- $conf->{tag}, "field.\n", $record->as_formatted(), "\n=====\n";
- next;
- }
-
- my $marc = populate_marc($record, $id);
- $marc = normalize_marc($marc);
-
- unless ($marc->{item_form} and ($marc->{date1} =~ /\d{4}/) and
- $marc->{record_type} and $marc->{bib_lvl} and $marc->{title}) {
- print XF "Record ", $marc->{id}, " did not make the cut: ";
- print XF "Missing item_form. " unless ($marc->{item_form});
- print XF "Missing valid date1. "
- unless (defined $marc->{date1} and $marc->{date1} =~ /\d{4}/);
- print XF "Missing record_type. " unless ($marc->{record_type});
- print XF "Missing bib_lvl. " unless ($marc->{bib_lvl});
- print XF "Missing title. " unless ($marc->{title});
- print XF "\n";
+ print XF "ERROR: Record $count in $file is missing a ",
+ $conf->{tag}, " field.\n", $record->as_formatted(), "\n=====\n";
next;
}
+ $marc = populate_marc($record, $id);
+ $marc = normalize_marc($marc);
+ unless (marc_isvalid($marc))
+ { dump_exception($marc); next; }
dump_fingerprints($marc);
}
}
# date1, date2
my $my_008 = $record->field('008');
$my_008 = $my_008->as_string() if ($my_008);
+ unless (length $my_008 == 40)
+ { print XF ">> Bad 008 field length in rec. $id\n"; return \%marc }
$marc{date1} = substr($my_008,7,4) if ($my_008);
$marc{date2} = substr($my_008,11,4) if ($my_008); # UNUSED
}
}
+ # "Accompanying material" (300e)
+ $marc{accomp} = $record->field('300')->subfield('e')
+ if $record->field('300');
+
# issn, lccn, title, desc, pages, pub, pubyear, edition
$marc{lccn} = $record->field('010')->subfield('a') if $record->field('010');
$marc{issn} = $record->field('022')->subfield('a') if $record->field('022');
+=head2 marc_isvalid
+
+Checks MARC record to see if neccessary fingerprinting data is
+available
+
+=cut
+
+sub marc_isvalid {
+ my ($marc) = @_;
+ return 1 if ($marc->{item_form} and ($marc->{date1} =~ /\d{4}/) and
+ $marc->{record_type} and $marc->{bib_lvl} and $marc->{title});
+ return 0;
+}
+
+
=head2 dump_fingerprints
=cut
$marc->{title}, $marc->{lccn}) ,"\n";
}
- # case e : author, publisher, pubyear, pages
+ if ($marc->{accomp}) { # case e : accomp
+ print OF join("\t", $marc->{id}, "case d",
+ $marc->{item_form}, $marc->{date1},
+ $marc->{record_type}, $marc->{bib_lvl},
+ $marc->{title}, $marc->{accomp}) ,"\n";
+ }
+
+ # case z : author, publisher, pubyear, pages
if ($marc->{author} and $marc->{publisher} and $marc->{pubyear}
and $marc->{pages}) {
print OF join("\t", $marc->{id}, "case e",
}
-=head2 initialyze
+
+=head2 dump_exception
+
+Write line of exception report
+
+=cut
+
+sub dump_exception {
+ my ($marc) = @_;
+ unless (defined $marc) {
+ print XF "Undefined record at line $count; likely bad XML\n";
+ return;
+ }
+ print XF "Record ", $marc->{id}, " did not make the cut: ";
+ print XF "Missing item_form. " unless ($marc->{item_form});
+ print XF "Missing date1. " unless (defined $marc->{date1});
+ print XF "Invalid date1: ", $marc->{date1}
+ unless (defined $marc->{date1} and $marc->{date1} =~ /\d{4}/);
+ print XF "Missing record_type. " unless ($marc->{record_type});
+ print XF "Missing bib_lvl. " unless ($marc->{bib_lvl});
+ print XF "Missing title. " unless ($marc->{title});
+ print XF "\n";
+}
+
+
+=head2 initialize
Performs boring script initialization. Handles argument parsing,
mostly.
=cut
-sub initialyze {
+sub initialize {
my ($c) = @_;
my @missing = ();
sub progress_ticker {
return if $conf->{quiet};
-
- if ($count % 100 == 0) {
- print '|';
- print " $count \n" unless ($count % 1400);
- } elsif ($count % 20 == 0) {
- print '.';
- }
+ printf("> %d (%d/s)\r", $count, ($count / (time - $start)))
+ if ($count % 100 == 0);
}
=head2 show_help