Example: spit_csv.pl marc.xml 999 a 903 a ; ### produces a tab-delimited file, one...

author Jason Etheridge <jason@esilibrary.com>

Thu, 12 Jun 2008 16:31:29 +0000 (16:31 +0000)

committer Jason Etheridge <jason@esilibrary.com>

Thu, 12 Jun 2008 16:31:29 +0000 (16:31 +0000)
author Jason Etheridge <jason@esilibrary.com>
Thu, 12 Jun 2008 16:31:29 +0000 (16:31 +0000)
committer Jason Etheridge <jason@esilibrary.com>
Thu, 12 Jun 2008 16:31:29 +0000 (16:31 +0000)
diff --git a/fingerprints.pl b/fingerprints.pl

index a7d43ee..8b78668 100755 (executable)
--- a/fingerprints.pl
+++ b/fingerprints.pl
@@ -46,8 +46,39 @@ foreach $argnum ( 3 .. $#ARGV ) {
                                $item_form = substr($my_008,23,1) if ($my_008);
                        }
 
-               my $title = $record->field('245'); 
-                       if ( $title ) { $title = $title->subfield('a'); }
+        my @titles = ();   # normalized title variants built from 245 and the 440/490 series statements
+        # Fold a raw title into its matching form: decompose (NFD), drop every character in
+        # \x{80}-\x{ffff}, lowercase, then trim non-word characters from both ends.
+        my $normalize_title = sub {
+            my ($t) = @_;
+            $t = NFD($t); $t =~ s/[\x{80}-\x{ffff}]//go; $t = lc($t); $t =~ s/\W+$//go; $t =~ s/^\W+//go;
+            return $t;
+        };
+        my $my_245 = $record->field('245');
+        if ( $my_245 ) {
+            push @titles, $normalize_title->( $my_245->subfield('a') );
+            if ($my_245->subfield('b')) {
+                push @titles, $normalize_title->( $my_245->subfield('a') . ', ' . $my_245->subfield('b') );
+                push @titles, $normalize_title->( "_magic_prefix_for_special_case_1_" . $my_245->subfield('b') );
+            }
+            # Ask the 245 field object for $p; a title string has no subfield() method to call.
+            if ($my_245->subfield('p')) {
+                push @titles, $normalize_title->( $my_245->subfield('a') . ', ' . $my_245->subfield('p') );
+            }
+            my $my_440 = $record->field('440');
+            if ($my_440 && $my_440->subfield('a')) {
+                push @titles, $normalize_title->( $my_440->subfield('a') . ', ' . $my_245->subfield('a') );
+                push @titles, $normalize_title->( "_magic_prefix_for_special_case_1_" . $my_245->subfield('a') );
+            }
+            my $my_490 = $record->field('490');
+            if ($my_490 && $my_490->subfield('a')) {
+                push @titles, $normalize_title->( $my_490->subfield('a') . ', ' . $my_245->subfield('a') );
+                push @titles, $normalize_title->( "_magic_prefix_for_special_case_1_" . $my_245->subfield('a') );
+            }
+        }
+        # NOTE(review): the $title formerly declared at this spot is gone, but the fingerprint
+        # test and the output statements further down this loop still read $title -- confirm
+        # that code is meant to walk @titles instead.
         
         my @isbns = ();
                my @isbns_020; if ($record->field('020')) { @isbns_020 = $record->field('020'); }
@@ -69,8 +100,8 @@ foreach $argnum ( 3 .. $#ARGV ) {
                        }
                my $desc = $record->field('300');
                        if ( $desc ) { $desc = $desc->subfield('a'); }
-               my $pages;
-                       if ($desc =~ /(\d+)/) { $pages = $1; }
+               my $pagination;   # first run of digits found in 300$a; stays undef when there is none
+                       if ($desc =~ /(\d+)/) { $pagination = $1; }
                my $my_260 = $record->field('260');
                my $publisher = $my_260->subfield('b') if ( $my_260 );
                my $pubyear = $my_260->subfield('c') if ( $my_260 );
@@ -82,11 +113,6 @@ foreach $argnum ( 3 .. $#ARGV ) {
 
                # NORMALIZE
                if ($record_type == ' ') { $record_type = 'a'; }
-               if ($title) {
-                       $title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go;
-                       $title = lc($title);
-                       $title =~ s/\W+$//go;
-               }
                if ($author) {
                        $author = NFD($author); $author =~ s/[\x{80}-\x{ffff}]//go;
                        $author = lc($author);
@@ -104,39 +130,34 @@ foreach $argnum ( 3 .. $#ARGV ) {
                        }
                }
 
-               # SPIT OUT FINGERPRINTS FROM THE "LOIS ALGORITHM"
+               # SPIT OUT FINGERPRINTS FROM THE "MODIFIED LOIS ALGORITHM"
                # If we're not getting good matches, we may want to change this.  The same thing goes for some other fields.
-               if ($item_form && ($date1 =~ /\d\d\d\d/) && $record_type && $bib_lvl && $title) {
+               if ($item_form && ($date1 =~ /\d\d\d\d/) && $record_type && $bib_lvl && $title && $author && $publisher && $pubyear && $pagination) {
 
             if ($which eq "primary") {
-                           print STDOUT join("\t",$id,$item_form,$date1,$record_type,$bib_lvl,$title) . "\n"; 
+                print STDOUT join("\t",$id,$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pagination) . "\n"; 
             } else {
                        
-                # case a : isbn and pages
-                if (scalar(@isbns)>0 && $pages) {
+                # case a : isbn 
+                if (scalar(@isbns)>0) {
                     foreach my $isbn ( @isbns ) {
-                        print STDOUT join("\t",$id,"case a",$item_form,$date1,$record_type,$bib_lvl,$title,$isbn,$pages) . "\n"; 
+                        print STDOUT join("\t",$id,"case a",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pagination,$isbn) . "\n"; 
                     }
                 }
 
                 # case b : edition
                 if ($edition) {
-                    print STDOUT join("\t",$id,"case b",$item_form,$date1,$record_type,$bib_lvl,$title,$edition) . "\n"; 
+                    print STDOUT join("\t",$id,"case b",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pagination,$edition) . "\n"; 
                 }
 
                 # case c : issn
                 if ($issn) {
-                    print STDOUT join("\t",$id,"case c",$item_form,$date1,$record_type,$bib_lvl,$title,$issn) . "\n"; 
+                    print STDOUT join("\t",$id,"case c",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pagination,$issn) . "\n"; 
                 }
 
                 # case d : lccn
                 if ($lccn) {
-                    print STDOUT join("\t",$id,"case d",$item_form,$date1,$record_type,$bib_lvl,$title,$lccn) . "\n"; 
-                }
-
-                # case e : author, publisher, pubyear, pages
-                if ($author && $publisher && $pubyear && $pages) {
-                    print STDOUT join("\t",$id,"case e",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pages) . "\n"; 
+                    print STDOUT join("\t",$id,"case d",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pagination,$lccn) . "\n"; 
                 }
 
             }
@@ -148,6 +169,10 @@ foreach $argnum ( 3 .. $#ARGV ) {
                        print STDERR "Missing record_type. " unless ($record_type);
                        print STDERR "Missing bib_lvl. " unless ($bib_lvl);
                        print STDERR "Missing title. " unless ($title);
+                       print STDERR "Missing author. " unless ($author);
+                       print STDERR "Missing publisher. " unless ($publisher);
+                       print STDERR "Missing pubyear. " unless ($pubyear);
+                       print STDERR "Missing pagination. " unless ($pagination);
                        print STDERR "\n";
 
                }
diff --git a/spit_csv.pl b/spit_csv.pl

new file mode 100755 (executable)

index 0000000..b0da245
--- /dev/null
+++ b/spit_csv.pl
@@ -0,0 +1,44 @@
+#!/usr/bin/perl
+# spit_csv.pl - print selected MARC subfields, tab-separated, one output line per record.
+#
+# Usage:   spit_csv.pl marc.xml TAG SUBFIELD [TAG SUBFIELD ...]
+# Example: spit_csv.pl marc.xml 999 a 903 a
+use strict;
+use warnings;
+use MARC::Batch;
+use MARC::File::XML ( BinaryEncoding => 'utf-8' );
+use MARC::Field;
+use Unicode::Normalize;
+
+# The first argument is the MARCXML file; everything after it is a list of tag / subfield-code pairs.
+my ($marc_file, @desired_tags_subfields) = @ARGV;
+die "Usage: $0 marc.xml TAG SUBFIELD [TAG SUBFIELD ...]\n"
+    unless defined($marc_file) && scalar(@desired_tags_subfields) % 2 == 0;
+
+my $count = 0;
+binmode(STDOUT, ':utf8');
+binmode(STDIN, ':utf8');
+
+print STDERR "Processing " . $marc_file . "\n";
+my $batch = MARC::Batch->new('XML', $marc_file);
+$batch->strict_off();
+$batch->warnings_off();
+
+while ( my $record = $batch->next() ) {
+    $count++;
+
+    # warnings_off() only stops MARC::Batch from printing; the messages raised while reading
+    # this record still have to be fetched explicitly before they can be reported here.
+    my @warnings = $batch->warnings();
+    print STDERR "WARNINGS: Record $count : " .  join(":",@warnings) . " : continuing...\n" if ( @warnings );
+
+    # Walk the tag / subfield-code pairs in command-line order; a tag may repeat within a record.
+    for (my $i = 0; $i < scalar(@desired_tags_subfields); $i+=2) {
+        my ($tag, $code) = @desired_tags_subfields[ $i, $i + 1 ];
+        foreach my $f ( $record->field($tag) ) {
+            my $value = $f->subfield($code);
+            print STDOUT $value . "\t" if ($value);   # each value is followed by a tab, so lines end with one
+        }
+    }
+    print STDOUT "\n";
+}
+print STDERR "Processed $count records\n";
author	Jason Etheridge <jason@esilibrary.com>
	Thu, 12 Jun 2008 16:31:29 +0000 (16:31 +0000)
committer	Jason Etheridge <jason@esilibrary.com>
	Thu, 12 Jun 2008 16:31:29 +0000 (16:31 +0000)
fingerprints.pl		patch \| blob \| history
spit_csv.pl	[new file with mode: 0755]	patch \| blob