From 25bfd84ab1572e0e26fe938af45d644f6824f954 Mon Sep 17 00:00:00 2001
From: Jason Etheridge
Date: Wed, 30 Jul 2008 23:21:26 +0000
Subject: [PATCH] skip problem records

Wrap the per-record work in an eval so that a record which makes
MARC::Batch->next() die is reported on STDERR and skipped instead of
aborting the whole run.

---
Review notes (not part of the commit message):
 - The eval block now ends with "};".  eval BLOCK is an expression, so the
   statement that follows it needs the terminator.
 - Restored the closing brace of "if ($publisher) {"; the added lines left
   the braces unbalanced.
 - End of batch: when next() returns undef the eval is left quietly.  It
   used to call ->field on undef, which logged a bogus "Trapped exception"
   and over-counted the processed records by one.
 - "next" inside the eval became "return", which leaves the eval block
   without a loop-control warning.
 - $record_type is compared to ' ' with eq instead of ==.
 - Leading whitespace in this hunk was reconstructed; use
   "git apply --ignore-whitespace" if the context does not match.  The
   post-image blob id on the index line predates these changes.

 fingerprints.pl |  265 +++++++++++++++++++++++++++++-------------------------
 1 files changed, 142 insertions(+), 123 deletions(-)

diff --git a/fingerprints.pl b/fingerprints.pl
index a7d43ee..223bae8 100755
--- a/fingerprints.pl
+++ b/fingerprints.pl
@@ -19,138 +19,157 @@ foreach $argnum ( 3 .. $#ARGV ) {
     $batch->strict_off();
     $batch->warnings_off();
 
-    while ( my $record = $batch->next() ) {
-
-        $count++;
-
-        my $id = $record->field($id_tag);
-        if (!$id) {
-            print STDERR "ERROR: This record is missing a $id_tag field.\n" . $record->as_formatted() . "\n=====\n";
-            next;
-        }
-        $id = $id->as_string($id_subfield);
-        print STDERR "WARNINGS: Record id " . $id . " : " . join(":",@warnings) . " : continuing...\n" if ( @warnings );
-
-        my $leader = $record->leader();
-        my $record_type = substr($leader,6,1);
-        my $bib_lvl = substr($leader,7,1);
-
-        my $my_008 = $record->field('008');
-        $my_008 = $my_008->as_string() if ($my_008);
-        my $date1 = substr($my_008,7,4) if ($my_008);
-        my $date2 = substr($my_008,11,4) if ($my_008);
-        my $item_form;
-        if ( $record_type =~ /[gkroef]/ ) { # MAP, VIS
-            $item_form = substr($my_008,29,1) if ($my_008);
-        } else {
-            $item_form = substr($my_008,23,1) if ($my_008);
-        }
-
-        my $title = $record->field('245');
-        if ( $title ) { $title = $title->subfield('a'); }
-
-        my @isbns = ();
-        my @isbns_020; if ($record->field('020')) { @isbns_020 = $record->field('020'); }
-        foreach my $f ( @isbns_020 ) { if ($f->subfield('a')) { if ( $f->subfield('a')=~/(\S+)/ ) { push @isbns, $1; } } }
-        my @isbns_024; if ($record->field('024')) { @isbns_024 = $record->field('024'); }
-        foreach my $f ( @isbns_024 ) { if ($f->subfield('a')) { if ( $f->subfield('a')=~/(\S+)/ ) { push @isbns, $1; } } }
-
-        my $issn = $record->field('022');
-        if ( $issn ) { $issn = $issn->subfield('a'); }
-        my $lccn = $record->field('010');
-        if ( $lccn ) { $lccn = $lccn->subfield('a'); }
-        my $author;
-        if ($record->field('100')) { $author = $record->field('100')->subfield('a'); }
-        if (! $author ) {
-            if ($record->field('110')) { $author = $record->field('110')->subfield('a'); }
-        }
-        if (! $author ) {
-            if ($record->field('111')) { $author = $record->field('111')->subfield('a'); }
-        }
-        my $desc = $record->field('300');
-        if ( $desc ) { $desc = $desc->subfield('a'); }
-        my $pages;
-        if ($desc =~ /(\d+)/) { $pages = $1; }
-        my $my_260 = $record->field('260');
-        my $publisher = $my_260->subfield('b') if ( $my_260 );
-        my $pubyear = $my_260->subfield('c') if ( $my_260 );
-        if ( $pubyear ) {
-            if ( $pubyear =~ /(\d\d\d\d)/ ) { $pubyear = $1; } else { $pubyear = ''; }
-        }
-        my $edition = $record->field('250');
-        if ( $edition ) { $edition = $edition->subfield('a'); }
-
-        # NORMALIZE
-        if ($record_type == ' ') { $record_type = 'a'; }
-        if ($title) {
-            $title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go;
-            $title = lc($title);
-            $title =~ s/\W+$//go;
-        }
-        if ($author) {
-            $author = NFD($author); $author =~ s/[\x{80}-\x{ffff}]//go;
-            $author = lc($author);
-            $author =~ s/\W+$//go;
-            if ($author =~ /^(\w+)/) {
-                $author = $1;
-            }
-        }
-        if ($publisher) {
-            $publisher = NFD($publisher); $publisher =~ s/[\x{80}-\x{ffff}]//go;
-            $publisher = lc($publisher);
-            $publisher =~ s/\W+$//go;
-            if ($publisher =~ /^(\w+)/) {
-                $publisher = $1;
-            }
-        }
-
-        # SPIT OUT FINGERPRINTS FROM THE "LOIS ALGORITHM"
-        # If we're not getting good matches, we may want to change this. The same thing goes for some other fields.
-        if ($item_form && ($date1 =~ /\d\d\d\d/) && $record_type && $bib_lvl && $title) {
-
-            if ($which eq "primary") {
-                print STDOUT join("\t",$id,$item_form,$date1,$record_type,$bib_lvl,$title) . "\n";
-            } else {
-
-                # case a : isbn and pages
-                if (scalar(@isbns)>0 && $pages) {
-                    foreach my $isbn ( @isbns ) {
-                        print STDOUT join("\t",$id,"case a",$item_form,$date1,$record_type,$bib_lvl,$title,$isbn,$pages) . "\n";
-                    }
+    # Fetch and fingerprint one record per pass.  The work runs inside an
+    # eval so that a record which makes MARC::Batch->next() die is reported
+    # on STDERR and skipped instead of aborting the whole run.  $record
+    # starts out true only so that the loop is entered.
+    my $record = 1;
+    while ( $record ) {
+        eval {
+            $count++;
+            $record = $batch->next();
+            if (!$record) {
+                # End of batch: undo the speculative increment and leave
+                # the eval quietly rather than dereferencing undef.
+                $count--;
+                return;
+            }
+
+            my $id = $record->field($id_tag);
+            if (!$id) {
+                print STDERR "ERROR: This record is missing a $id_tag field.\n" . $record->as_formatted() . "\n=====\n";
+                # 'return' leaves the eval block; 'next' here would exit
+                # the eval through a loop control and draw a warning.
+                return;
+            }
+            $id = $id->as_string($id_subfield);
+            print STDERR "WARNINGS: Record id " . $id . " : " . join(":",@warnings) . " : continuing...\n" if ( @warnings );
+
+            my $leader = $record->leader();
+            my $record_type = substr($leader,6,1);
+            my $bib_lvl = substr($leader,7,1);
+
+            my $my_008 = $record->field('008');
+            $my_008 = $my_008->as_string() if ($my_008);
+            my $date1 = substr($my_008,7,4) if ($my_008);
+            my $date2 = substr($my_008,11,4) if ($my_008);
+            my $item_form;
+            if ( $record_type =~ /[gkroef]/ ) { # MAP, VIS
+                $item_form = substr($my_008,29,1) if ($my_008);
+            } else {
+                $item_form = substr($my_008,23,1) if ($my_008);
                 }
 
-                # case b : edition
-                if ($edition) {
-                    print STDOUT join("\t",$id,"case b",$item_form,$date1,$record_type,$bib_lvl,$title,$edition) . "\n";
+            my $title = $record->field('245');
+            if ( $title ) { $title = $title->subfield('a'); }
+
+            my @isbns = ();
+            my @isbns_020; if ($record->field('020')) { @isbns_020 = $record->field('020'); }
+            foreach my $f ( @isbns_020 ) { if ($f->subfield('a')) { if ( $f->subfield('a')=~/(\S+)/ ) { push @isbns, $1; } } }
+            my @isbns_024; if ($record->field('024')) { @isbns_024 = $record->field('024'); }
+            foreach my $f ( @isbns_024 ) { if ($f->subfield('a')) { if ( $f->subfield('a')=~/(\S+)/ ) { push @isbns, $1; } } }
+
+            my $issn = $record->field('022');
+            if ( $issn ) { $issn = $issn->subfield('a'); }
+            my $lccn = $record->field('010');
+            if ( $lccn ) { $lccn = $lccn->subfield('a'); }
+            my $author;
+            if ($record->field('100')) { $author = $record->field('100')->subfield('a'); }
+            if (! $author ) {
+                if ($record->field('110')) { $author = $record->field('110')->subfield('a'); }
                 }
-
-                # case c : issn
-                if ($issn) {
-                    print STDOUT join("\t",$id,"case c",$item_form,$date1,$record_type,$bib_lvl,$title,$issn) . "\n";
+            if (! $author ) {
+                if ($record->field('111')) { $author = $record->field('111')->subfield('a'); }
                 }
-
-                # case d : lccn
-                if ($lccn) {
-                    print STDOUT join("\t",$id,"case d",$item_form,$date1,$record_type,$bib_lvl,$title,$lccn) . "\n";
+            my $desc = $record->field('300');
+            if ( $desc ) { $desc = $desc->subfield('a'); }
+            my $pages;
+            if ($desc =~ /(\d+)/) { $pages = $1; }
+            my $my_260 = $record->field('260');
+            my $publisher = $my_260->subfield('b') if ( $my_260 );
+            my $pubyear = $my_260->subfield('c') if ( $my_260 );
+            if ( $pubyear ) {
+                if ( $pubyear =~ /(\d\d\d\d)/ ) { $pubyear = $1; } else { $pubyear = ''; }
                 }
-
-                # case e : author, publisher, pubyear, pages
-                if ($author && $publisher && $pubyear && $pages) {
-                    print STDOUT join("\t",$id,"case e",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pages) . "\n";
+            my $edition = $record->field('250');
+            if ( $edition ) { $edition = $edition->subfield('a'); }
+
+            # NORMALIZE
+            # String comparison: with '==' both sides numify to 0, so every
+            # alphabetic record type was being rewritten to 'a'.
+            if ($record_type eq ' ') { $record_type = 'a'; }
+            if ($title) {
+                $title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go;
+                $title = lc($title);
+                $title =~ s/\W+$//go;
+            }
+            if ($author) {
+                $author = NFD($author); $author =~ s/[\x{80}-\x{ffff}]//go;
+                $author = lc($author);
+                $author =~ s/\W+$//go;
+                if ($author =~ /^(\w+)/) {
+                    $author = $1;
+                }
+            }
+            if ($publisher) {
+                $publisher = NFD($publisher); $publisher =~ s/[\x{80}-\x{ffff}]//go;
+                $publisher = lc($publisher);
+                $publisher =~ s/\W+$//go;
+                if ($publisher =~ /^(\w+)/) {
+                    $publisher = $1;
                 }
+            }
 
-            }
-        } else {
-            print STDERR "Record " . $id . " did not make the cut: ";
-            print STDERR "Missing item_form. " unless ($item_form);
-            print STDERR "Missing valid date1. " unless ($date1 =~ /\d\d\d\d/);
-            print STDERR "Missing record_type. " unless ($record_type);
-            print STDERR "Missing bib_lvl. " unless ($bib_lvl);
-            print STDERR "Missing title. " unless ($title);
-            print STDERR "\n";
 
 
+            # SPIT OUT FINGERPRINTS FROM THE "LOIS ALGORITHM"
+            # If we're not getting good matches, we may want to change this. The same thing goes for some other fields.
+            if ($item_form && ($date1 =~ /\d\d\d\d/) && $record_type && $bib_lvl && $title) {
+
+                if ($which eq "primary") {
+                    print STDOUT join("\t",$id,$item_form,$date1,$record_type,$bib_lvl,$title) . "\n";
+                } else {
+
+                    # case a : isbn and pages
+                    if (scalar(@isbns)>0 && $pages) {
+                        foreach my $isbn ( @isbns ) {
+                            print STDOUT join("\t",$id,"case a",$item_form,$date1,$record_type,$bib_lvl,$title,$isbn,$pages) . "\n";
+                        }
+                    }
+
+                    # case b : edition
+                    if ($edition) {
+                        print STDOUT join("\t",$id,"case b",$item_form,$date1,$record_type,$bib_lvl,$title,$edition) . "\n";
+                    }
+
+                    # case c : issn
+                    if ($issn) {
+                        print STDOUT join("\t",$id,"case c",$item_form,$date1,$record_type,$bib_lvl,$title,$issn) . "\n";
+                    }
+
+                    # case d : lccn
+                    if ($lccn) {
+                        print STDOUT join("\t",$id,"case d",$item_form,$date1,$record_type,$bib_lvl,$title,$lccn) . "\n";
+                    }
+
+                    # case e : author, publisher, pubyear, pages
+                    if ($author && $publisher && $pubyear && $pages) {
+                        print STDOUT join("\t",$id,"case e",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pages) . "\n";
+                    }
+
+                }
-        }
+            } else {
+                print STDERR "Record " . $id . " did not make the cut: ";
+                print STDERR "Missing item_form. " unless ($item_form);
+                print STDERR "Missing valid date1. " unless ($date1 =~ /\d\d\d\d/);
+                print STDERR "Missing record_type. " unless ($record_type);
+                print STDERR "Missing bib_lvl. " unless ($bib_lvl);
+                print STDERR "Missing title. " unless ($title);
+                print STDERR "\n";
+
+            }
+        };  # eval BLOCK is an expression, so the terminating ';' is required
+        print STDERR "Trapped exception for MARC::Batch->next on record $count: $@\n" if ($@);
     }
     print STDERR "Processed $count records\n";
 }
-- 
1.7.2.5