my $conf = {}; # configuration hashref
my $count = 0;
+my $start = time;
$| = 1;
-initialyze($conf);
+initialize($conf);
open OF, '>', $conf->{output};
binmode(OF, ':utf8');
for my $file (@ARGV) {
print XF "Processing $file\n";
open my $records, '<:utf8', $file;
+ my $batch = undef; my $record = undef;
- my $batch = MARC::Batch->new('XML', $records);
+ $batch = MARC::Batch->new('XML', $records);
$batch->strict_off();
$batch->warnings_off();
- while ( my $record = $batch->next ) {
+ while ( eval { $record = $batch->next } ) {
+ my $marc = undef;
$count++; progress_ticker();
+ unless ( defined $record )
+ { dump_exception($marc); next; }
my $id = $record->field($conf->{tag});
unless ($id) {
- print XF "ERROR: Record $count in $file is missing a",
- $conf->{tag}, "field.\n", $record->as_formatted(), "\n=====\n";
+ print XF "ERROR: Record $count in $file is missing a ",
+ $conf->{tag}, " field.\n", $record->as_formatted(), "\n=====\n";
next;
}
- my $marc = populate_marc($record, $id);
- $marc = normalize_marc($marc);
+ $marc = populate_marc($record, $id);
+ $marc = normalize_marc($marc);
unless (marc_isvalid($marc))
{ dump_exception($marc); next; }
dump_fingerprints($marc);
my $my_008 = $record->field('008');
$my_008 = $my_008->as_string() if ($my_008);
unless (length $my_008 == 40)
- { print XF ">> Bad 008 field length in rec. $id\n"; return $marc }
+ { print XF ">> Bad 008 field length in rec. $id\n"; return \%marc }
$marc{date1} = substr($my_008,7,4) if ($my_008);
$marc{date2} = substr($my_008,11,4) if ($my_008); # UNUSED
}
}
+ # "Accompanying material" (300e)
+ $marc{accomp} = $record->field('300')->subfield('e')
+ if $record->field('300');
+
# issn, lccn, title, desc, pages, pub, pubyear, edition
$marc{lccn} = $record->field('010')->subfield('a') if $record->field('010');
$marc{issn} = $record->field('022')->subfield('a') if $record->field('022');
$marc->{title}, $marc->{lccn}) ,"\n";
}
- # case e : author, publisher, pubyear, pages
+ if ($marc->{accomp}) { # case e : accomp
+ print OF join("\t", $marc->{id}, "case d",
+ $marc->{item_form}, $marc->{date1},
+ $marc->{record_type}, $marc->{bib_lvl},
+ $marc->{title}, $marc->{accomp}) ,"\n";
+ }
+
+ # case z : author, publisher, pubyear, pages
if ($marc->{author} and $marc->{publisher} and $marc->{pubyear}
and $marc->{pages}) {
print OF join("\t", $marc->{id}, "case e",
sub dump_exception {
my ($marc) = @_;
+ unless (defined $marc) {
+ print XF "Undefined record at line $count; likely bad XML\n";
+ return;
+ }
print XF "Record ", $marc->{id}, " did not make the cut: ";
print XF "Missing item_form. " unless ($marc->{item_form});
print XF "Missing date1. " unless (defined $marc->{date1});
print XF "Invalid date1: ", $marc->{date1}
- unless ($marc->{date1} =~ /\d{4}/);
+ unless (defined $marc->{date1} and $marc->{date1} =~ /\d{4}/);
print XF "Missing record_type. " unless ($marc->{record_type});
print XF "Missing bib_lvl. " unless ($marc->{bib_lvl});
print XF "Missing title. " unless ($marc->{title});
}
-=head2 initialyze
+=head2 initialize
Performs boring script initialization. Handles argument parsing,
mostly.
=cut
-sub initialyze {
+sub initialize {
my ($c) = @_;
my @missing = ();
sub progress_ticker {
return if $conf->{quiet};
-
- if ($count % 100 == 0) {
- print '|';
- print " $count \n" unless ($count % 1400);
- } elsif ($count % 20 == 0) {
- print '.';
- }
+ printf("> %d (%d/s)\r", $count, ($count / (time - $start)))
+ if ($count % 100 == 0);
}
=head2 show_help