#!/usr/bin/perl
# fingerprints.pl
#
# Reads MARCXML files and prints tab-separated "fingerprint" lines built from
# selected leader, 008 and variable-field values of each record.
#
# Usage:
#   fingerprints.pl <which> <id_tag> <id_subfield> <file.xml> [<file.xml> ...]
#
#   which        "primary" prints one line per qualifying record; any other
#                value prints the "case a" .. "case e" lines instead.
#   id_tag       MARC tag of the field holding the record identifier.
#   id_subfield  Subfield code handed to as_string() on that field.
#
# Fingerprints go to STDOUT; progress and diagnostics go to STDERR.
use strict;
use warnings;

use MARC::Batch;
use MARC::File::XML ( BinaryEncoding => 'utf-8' );
use MARC::Field;
use Unicode::Normalize;

my $count = 0;
my ( $which, $id_tag, $id_subfield, @files ) = @ARGV;

binmode(STDOUT, ':utf8');
binmode(STDIN, ':utf8');

for my $file (@files) {

    print STDERR "Processing " . $file . "\n";

    my $batch = MARC::Batch->new('XML', $file);
    $batch->strict_off();
    $batch->warnings_off();

    RECORD:
    while ( my $record = $batch->next() ) {

        $count++;

        # warnings_off() only stops MARC::Batch from printing; the warnings
        # still have to be fetched explicitly or they are never reported.
        my @warnings = $batch->warnings();

        my $id_field = $record->field($id_tag);
        if ( !$id_field ) {
            print STDERR "ERROR: This record is missing a $id_tag field.\n" . $record->as_formatted() . "\n=====\n";
            next RECORD;
        }
        my $id = $id_field->as_string($id_subfield);
        $id = '' unless defined $id;
        print STDERR "WARNINGS: Record id " . $id . " : " . join(":", @warnings) . " : continuing...\n" if @warnings;

        # Leader positions 6 and 7.
        my $leader      = $record->leader();
        my $record_type = fixed_chars( $leader, 6, 1 );
        my $bib_lvl     = fixed_chars( $leader, 7, 1 );

        # 008 fixed-length data. Every value is declared unconditionally so
        # that nothing can leak over from the previous record in the loop.
        my $field_008 = $record->field('008');
        my $data_008  = $field_008 ? $field_008->as_string() : undef;
        my $date1     = fixed_chars( $data_008, 7, 4 );

        # Record types g/k/r/o/e/f (MAP, VIS) read the form of item from
        # position 29; everything else reads it from position 23.
        my $form_offset = ( defined $record_type && $record_type =~ /[gkroef]/ ) ? 29 : 23;
        my $item_form   = fixed_chars( $data_008, $form_offset, 1 );

        my $title = first_subfield( $record, '245', 'a' );

        # First whitespace-delimited token of every 020$a, then every 024$a.
        my @isbns;
        for my $field ( $record->field('020'), $record->field('024') ) {
            my $value = $field->subfield('a');
            push @isbns, $1 if $value && $value =~ /(\S+)/;
        }

        my $issn = first_subfield( $record, '022', 'a' );
        my $lccn = first_subfield( $record, '010', 'a' );

        # Author: 100$a, falling back to 110$a and then 111$a.
        my $author;
        for my $tag (qw(100 110 111)) {
            $author = first_subfield( $record, $tag, 'a' );
            last if $author;
        }

        # Pages: first run of digits in 300$a.
        my $desc = first_subfield( $record, '300', 'a' );
        my $pages;
        if ( defined $desc && $desc =~ /(\d+)/ ) { $pages = $1; }

        my $publisher = first_subfield( $record, '260', 'b' );
        my $pubyear   = first_subfield( $record, '260', 'c' );
        if ($pubyear) {
            $pubyear = ( $pubyear =~ /(\d\d\d\d)/ ) ? $1 : '';
        }

        my $edition = first_subfield( $record, '250', 'a' );

        # NORMALIZE
        # String comparison is required here: with '==' both sides numify to
        # 0 for any letter code, which turned every record type into 'a'.
        if ( defined $record_type && $record_type eq ' ' ) { $record_type = 'a'; }
        $title     = normalize_text($title)                  if $title;
        $author    = leading_word( normalize_text($author) ) if $author;
        $publisher = leading_word( normalize_text($publisher) ) if $publisher;

        # SPIT OUT FINGERPRINTS FROM THE "LOIS ALGORITHM"
        # If we're not getting good matches, we may want to change this.  The same thing goes for some other fields.
        my $has_date1 = ( defined $date1 && $date1 =~ /\d\d\d\d/ );

        if ( $item_form && $has_date1 && $record_type && $bib_lvl && $title ) {

            # Columns shared by every fingerprint line of this record.
            my @base = ( $item_form, $date1, $record_type, $bib_lvl, $title );

            if ( $which eq "primary" ) {
                print STDOUT join("\t", $id, @base) . "\n";
            }
            else {

                # case a : isbn and pages
                if ( scalar(@isbns) > 0 && $pages ) {
                    for my $isbn (@isbns) {
                        print STDOUT join("\t", $id, "case a", @base, $isbn, $pages) . "\n";
                    }
                }

                # case b : edition
                if ($edition) {
                    print STDOUT join("\t", $id, "case b", @base, $edition) . "\n";
                }

                # case c : issn
                if ($issn) {
                    print STDOUT join("\t", $id, "case c", @base, $issn) . "\n";
                }

                # case d : lccn
                if ($lccn) {
                    print STDOUT join("\t", $id, "case d", @base, $lccn) . "\n";
                }

                # case e : author, publisher, pubyear, pages
                if ( $author && $publisher && $pubyear && $pages ) {
                    print STDOUT join("\t", $id, "case e", @base, $author, $publisher, $pubyear, $pages) . "\n";
                }
            }
        }
        else {
            print STDERR "Record " . $id . " did not make the cut: ";
            print STDERR "Missing item_form. " unless ($item_form);
            print STDERR "Missing valid date1. " unless ($has_date1);
            print STDERR "Missing record_type. " unless ($record_type);
            print STDERR "Missing bib_lvl. " unless ($bib_lvl);
            print STDERR "Missing title. " unless ($title);
            print STDERR "\n";
        }
    }

    # Running total, reported after each input file.
    print STDERR "Processed $count records\n";
}

# Return $length characters of $data starting at $offset, or undef when
# $data is undefined or too short to reach $offset.
sub fixed_chars {
    my ( $data, $offset, $length ) = @_;
    return undef unless defined $data && length($data) > $offset;
    return substr( $data, $offset, $length );
}

# Return subfield $code of the first $tag field in $record, or undef when the
# record has no such field. The lookup is forced into scalar context.
sub first_subfield {
    my ( $record, $tag, $code ) = @_;
    my $field = $record->field($tag);
    return undef unless $field;
    my $value = $field->subfield($code);
    return $value;
}

# Decompose to NFD, drop every character in U+0080..U+FFFF (which removes the
# separated combining marks), lowercase, and trim trailing non-word characters.
sub normalize_text {
    my ($text) = @_;
    $text = NFD($text);
    $text =~ s/[\x{80}-\x{ffff}]//g;
    $text = lc($text);
    $text =~ s/\W+$//g;
    return $text;
}

# Return the leading run of word characters of $text, or $text unchanged when
# it does not start with a word character.
sub leading_word {
    my ($text) = @_;
    return ( $text =~ /^(\w+)/ ) ? $1 : $text;
}