skip problem records
author Jason Etheridge <jason@esilibrary.com>
Wed, 30 Jul 2008 23:21:26 +0000 (23:21 +0000)
committer Jason Etheridge <jason@esilibrary.com>
Wed, 30 Jul 2008 23:21:26 +0000 (23:21 +0000)
fingerprints.pl

index a7d43ee..223bae8 100755 (executable)
@@ -19,138 +19,142 @@ foreach $argnum ( 3 .. $#ARGV ) {
        $batch->strict_off();
        $batch->warnings_off();
 
-       while ( my $record = $batch->next() ) {
-
-        $count++;
-
-               my $id = $record->field($id_tag);
-               if (!$id) {
-                       print STDERR "ERROR: This record is missing a $id_tag field.\n" . $record->as_formatted() . "\n=====\n";
-                       next;
-               }
-               $id = $id->as_string($id_subfield);
-               print STDERR "WARNINGS: Record id " . $id . " : " .  join(":",@warnings) . " : continuing...\n" if ( @warnings );
-
-               my $leader = $record->leader();
-               my $record_type = substr($leader,6,1);
-               my $bib_lvl = substr($leader,7,1);
-
-               my $my_008 = $record->field('008');
-                       $my_008 = $my_008->as_string() if ($my_008);
-               my $date1 = substr($my_008,7,4) if ($my_008);
-               my $date2 = substr($my_008,11,4) if ($my_008);
-               my $item_form;
-                       if ( $record_type =~ /[gkroef]/ ) { # MAP, VIS
-                               $item_form = substr($my_008,29,1) if ($my_008);
-                       } else {
-                               $item_form = substr($my_008,23,1) if ($my_008);
-                       }
-
-               my $title = $record->field('245'); 
-                       if ( $title ) { $title = $title->subfield('a'); }
-        
-        my @isbns = ();
-               my @isbns_020; if ($record->field('020')) { @isbns_020 = $record->field('020'); }
-               foreach my $f ( @isbns_020 ) { if ($f->subfield('a')) { if ( $f->subfield('a')=~/(\S+)/ ) { push @isbns, $1; } } }
-               my @isbns_024; if ($record->field('024')) { @isbns_024 = $record->field('024'); }
-               foreach my $f ( @isbns_024 ) { if ($f->subfield('a')) { if ( $f->subfield('a')=~/(\S+)/ ) { push @isbns, $1; } } }
-
-               my $issn = $record->field('022');
-                       if ( $issn ) { $issn = $issn->subfield('a'); }
-               my $lccn = $record->field('010');
-                       if ( $lccn ) { $lccn = $lccn->subfield('a'); }
-               my $author;
-                       if ($record->field('100')) { $author = $record->field('100')->subfield('a'); }
-                       if (! $author ) {
-                               if ($record->field('110')) { $author = $record->field('110')->subfield('a'); }
-                       }
-                       if (! $author ) {
-                               if ($record->field('111')) { $author = $record->field('111')->subfield('a'); }
-                       }
-               my $desc = $record->field('300');
-                       if ( $desc ) { $desc = $desc->subfield('a'); }
-               my $pages;
-                       if ($desc =~ /(\d+)/) { $pages = $1; }
-               my $my_260 = $record->field('260');
-               my $publisher = $my_260->subfield('b') if ( $my_260 );
-               my $pubyear = $my_260->subfield('c') if ( $my_260 );
-                       if ( $pubyear ) { 
-                               if ( $pubyear =~ /(\d\d\d\d)/ ) { $pubyear = $1; } else { $pubyear = ''; }
-                       }
-               my $edition = $record->field('250');
-                       if ( $edition ) { $edition = $edition->subfield('a'); }
-
-               # NORMALIZE
-               if ($record_type == ' ') { $record_type = 'a'; }
-               if ($title) {
-                       $title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go;
-                       $title = lc($title);
-                       $title =~ s/\W+$//go;
-               }
-               if ($author) {
-                       $author = NFD($author); $author =~ s/[\x{80}-\x{ffff}]//go;
-                       $author = lc($author);
-                       $author =~ s/\W+$//go;
-                       if ($author =~ /^(\w+)/) {
-                               $author = $1;
-                       }
-               }
-               if ($publisher) {
-                       $publisher = NFD($publisher); $publisher =~ s/[\x{80}-\x{ffff}]//go;
-                       $publisher = lc($publisher);
-                       $publisher =~ s/\W+$//go;
-                       if ($publisher =~ /^(\w+)/) {
-                               $publisher = $1;
-                       }
-               }
-
-               # SPIT OUT FINGERPRINTS FROM THE "LOIS ALGORITHM"
-               # If we're not getting good matches, we may want to change this.  The same thing goes for some other fields.
-               if ($item_form && ($date1 =~ /\d\d\d\d/) && $record_type && $bib_lvl && $title) {
-
-            if ($which eq "primary") {
-                           print STDOUT join("\t",$id,$item_form,$date1,$record_type,$bib_lvl,$title) . "\n"; 
-            } else {
-                       
-                # case a : isbn and pages
-                if (scalar(@isbns)>0 && $pages) {
-                    foreach my $isbn ( @isbns ) {
-                        print STDOUT join("\t",$id,"case a",$item_form,$date1,$record_type,$bib_lvl,$title,$isbn,$pages) . "\n"; 
-                    }
+    my $record = 1;
+       while ( $record ) {
+        eval {
+            $count++;
+            $record = $batch->next();
+
+            my $id = $record->field($id_tag);
+            if (!$id) {
+                print STDERR "ERROR: This record is missing a $id_tag field.\n" . $record->as_formatted() . "\n=====\n";
+                next;
+            }
+            $id = $id->as_string($id_subfield);
+            print STDERR "WARNINGS: Record id " . $id . " : " .  join(":",@warnings) . " : continuing...\n" if ( @warnings );
+
+            my $leader = $record->leader();
+            my $record_type = substr($leader,6,1);
+            my $bib_lvl = substr($leader,7,1);
+
+            my $my_008 = $record->field('008');
+                $my_008 = $my_008->as_string() if ($my_008);
+            my $date1 = substr($my_008,7,4) if ($my_008);
+            my $date2 = substr($my_008,11,4) if ($my_008);
+            my $item_form;
+                if ( $record_type =~ /[gkroef]/ ) { # MAP, VIS
+                    $item_form = substr($my_008,29,1) if ($my_008);
+                } else {
+                    $item_form = substr($my_008,23,1) if ($my_008);
                 }
 
-                # case b : edition
-                if ($edition) {
-                    print STDOUT join("\t",$id,"case b",$item_form,$date1,$record_type,$bib_lvl,$title,$edition) . "\n"; 
+            my $title = $record->field('245'); 
+                if ( $title ) { $title = $title->subfield('a'); }
+            
+            my @isbns = ();
+            my @isbns_020; if ($record->field('020')) { @isbns_020 = $record->field('020'); }
+            foreach my $f ( @isbns_020 ) { if ($f->subfield('a')) { if ( $f->subfield('a')=~/(\S+)/ ) { push @isbns, $1; } } }
+            my @isbns_024; if ($record->field('024')) { @isbns_024 = $record->field('024'); }
+            foreach my $f ( @isbns_024 ) { if ($f->subfield('a')) { if ( $f->subfield('a')=~/(\S+)/ ) { push @isbns, $1; } } }
+
+            my $issn = $record->field('022');
+                if ( $issn ) { $issn = $issn->subfield('a'); }
+            my $lccn = $record->field('010');
+                if ( $lccn ) { $lccn = $lccn->subfield('a'); }
+            my $author;
+                if ($record->field('100')) { $author = $record->field('100')->subfield('a'); }
+                if (! $author ) {
+                    if ($record->field('110')) { $author = $record->field('110')->subfield('a'); }
                 }
-
-                # case c : issn
-                if ($issn) {
-                    print STDOUT join("\t",$id,"case c",$item_form,$date1,$record_type,$bib_lvl,$title,$issn) . "\n"; 
+                if (! $author ) {
+                    if ($record->field('111')) { $author = $record->field('111')->subfield('a'); }
                 }
-
-                # case d : lccn
-                if ($lccn) {
-                    print STDOUT join("\t",$id,"case d",$item_form,$date1,$record_type,$bib_lvl,$title,$lccn) . "\n"; 
+            my $desc = $record->field('300');
+                if ( $desc ) { $desc = $desc->subfield('a'); }
+            my $pages;
+                if ($desc =~ /(\d+)/) { $pages = $1; }
+            my $my_260 = $record->field('260');
+            my $publisher = $my_260->subfield('b') if ( $my_260 );
+            my $pubyear = $my_260->subfield('c') if ( $my_260 );
+                if ( $pubyear ) { 
+                    if ( $pubyear =~ /(\d\d\d\d)/ ) { $pubyear = $1; } else { $pubyear = ''; }
                 }
-
-                # case e : author, publisher, pubyear, pages
-                if ($author && $publisher && $pubyear && $pages) {
-                    print STDOUT join("\t",$id,"case e",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pages) . "\n"; 
+            my $edition = $record->field('250');
+                if ( $edition ) { $edition = $edition->subfield('a'); }
+
+            # NORMALIZE
+            if ($record_type == ' ') { $record_type = 'a'; }
+            if ($title) {
+                $title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go;
+                $title = lc($title);
+                $title =~ s/\W+$//go;
+            }
+            if ($author) {
+                $author = NFD($author); $author =~ s/[\x{80}-\x{ffff}]//go;
+                $author = lc($author);
+                $author =~ s/\W+$//go;
+                if ($author =~ /^(\w+)/) {
+                    $author = $1;
+                }
+            }
+            if ($publisher) {
+                $publisher = NFD($publisher); $publisher =~ s/[\x{80}-\x{ffff}]//go;
+                $publisher = lc($publisher);
+                $publisher =~ s/\W+$//go;
+                if ($publisher =~ /^(\w+)/) {
+                    $publisher = $1;
                 }
-
             }
 
-               } else {
-                       print STDERR "Record " . $id . " did not make the cut: ";
-                       print STDERR "Missing item_form. " unless ($item_form);
-                       print STDERR "Missing valid date1. " unless ($date1 =~ /\d\d\d\d/);
-                       print STDERR "Missing record_type. " unless ($record_type);
-                       print STDERR "Missing bib_lvl. " unless ($bib_lvl);
-                       print STDERR "Missing title. " unless ($title);
-                       print STDERR "\n";
+            # SPIT OUT FINGERPRINTS FROM THE "LOIS ALGORITHM"
+            # If we're not getting good matches, we may want to change this.  The same thing goes for some other fields.
+            if ($item_form && ($date1 =~ /\d\d\d\d/) && $record_type && $bib_lvl && $title) {
+
+                if ($which eq "primary") {
+                    print STDOUT join("\t",$id,$item_form,$date1,$record_type,$bib_lvl,$title) . "\n"; 
+                } else {
+                
+                    # case a : isbn and pages
+                    if (scalar(@isbns)>0 && $pages) {
+                        foreach my $isbn ( @isbns ) {
+                            print STDOUT join("\t",$id,"case a",$item_form,$date1,$record_type,$bib_lvl,$title,$isbn,$pages) . "\n"; 
+                        }
+                    }
+
+                    # case b : edition
+                    if ($edition) {
+                        print STDOUT join("\t",$id,"case b",$item_form,$date1,$record_type,$bib_lvl,$title,$edition) . "\n"; 
+                    }
+
+                    # case c : issn
+                    if ($issn) {
+                        print STDOUT join("\t",$id,"case c",$item_form,$date1,$record_type,$bib_lvl,$title,$issn) . "\n"; 
+                    }
+
+                    # case d : lccn
+                    if ($lccn) {
+                        print STDOUT join("\t",$id,"case d",$item_form,$date1,$record_type,$bib_lvl,$title,$lccn) . "\n"; 
+                    }
+
+                    # case e : author, publisher, pubyear, pages
+                    if ($author && $publisher && $pubyear && $pages) {
+                        print STDOUT join("\t",$id,"case e",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pages) . "\n"; 
+                    }
+
+                }
 
-               }
+            } else {
+                print STDERR "Record " . $id . " did not make the cut: ";
+                print STDERR "Missing item_form. " unless ($item_form);
+                print STDERR "Missing valid date1. " unless ($date1 =~ /\d\d\d\d/);
+                print STDERR "Missing record_type. " unless ($record_type);
+                print STDERR "Missing bib_lvl. " unless ($bib_lvl);
+                print STDERR "Missing title. " unless ($title);
+                print STDERR "\n";
+
+            }
+        }
+        print STDERR "Trapped exception for MARC::Batch->next on record $count: $@\n" if ($@);
        }
     print STDERR "Processed $count records\n";
 }