From de9de34628637b78668e4ac358971d05cfb6b87f Mon Sep 17 00:00:00 2001
From: Jason Etheridge <jason@esilibrary.com>
Date: Thu, 12 Jun 2008 16:31:29 +0000
Subject: [PATCH] Example: spit_csv.pl marc.xml 999 a 903 a ; ### produces a tab-delimited file, one line per record, with 999a as the first column and 903a as the second column

---
 fingerprints.pl |   71 +++++++++++++++++++++++++++++++++++++-----------------
 spit_csv.pl     |   44 ++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+), 23 deletions(-)
 create mode 100755 spit_csv.pl

diff --git a/fingerprints.pl b/fingerprints.pl
index a7d43ee..8b78668 100755
--- a/fingerprints.pl
+++ b/fingerprints.pl
@@ -46,8 +46,39 @@ foreach $argnum ( 3 .. $#ARGV ) {
 				$item_form = substr($my_008,23,1) if ($my_008);
 			}
 
-		my $title = $record->field('245'); 
-			if ( $title ) { $title = $title->subfield('a'); }
+        my @titles = ();
+        my $title;    # normalized 245$a; declared at this scope because the fingerprint code below still reads $title
+        my $magic = "_magic_prefix_for_special_case_1_";
+        # Normalize a raw title: NFD-decompose, drop U+0080..U+FFFF, lowercase, trim non-word chars at both ends.
+        my $normalize_title = sub {
+            my ($raw) = @_;
+            my $norm = NFD($raw);
+            $norm =~ s/[\x{80}-\x{ffff}]//g;
+            $norm = lc($norm);
+            $norm =~ s/\W+$//g;
+            $norm =~ s/^\W+//g;
+            return $norm;
+        };
+        my $my_245 = $record->field('245');
+        if ($my_245) {
+            my $sub_a = $my_245->subfield('a'); $sub_a = '' unless defined $sub_a;
+            $title = $normalize_title->($sub_a);
+            push @titles, $title;
+            my $sub_b = $my_245->subfield('b');
+            if ($sub_b) {
+                push @titles, $normalize_title->($sub_a . ', ' . $sub_b);
+                push @titles, $normalize_title->($magic . $sub_b);
+            }
+            my $sub_p = $my_245->subfield('p');    # must come from the 245 field; $title is a plain string, not a field object
+            push @titles, $normalize_title->($sub_a . ', ' . $sub_p) if ($sub_p);
+            # 440$a and 490$a each add "<series>, <245$a>" plus the magic-prefixed 245$a, in that tag order.
+            foreach my $series_tag ('440', '490') {
+                my $series = $record->field($series_tag) or next;
+                my $series_a = $series->subfield('a') or next;
+                push @titles, $normalize_title->($series_a . ', ' . $sub_a);
+                push @titles, $normalize_title->($magic . $sub_a);
+            }
+        }
         
         my @isbns = ();
 		my @isbns_020; if ($record->field('020')) { @isbns_020 = $record->field('020'); }
@@ -69,8 +100,8 @@ foreach $argnum ( 3 .. $#ARGV ) {
 			}
 		my $desc = $record->field('300');
 			if ( $desc ) { $desc = $desc->subfield('a'); }
-		my $pages;
-			if ($desc =~ /(\d+)/) { $pages = $1; }
+		my $pagination;
+			if ($desc =~ /(\d+)/) { $pagination = $1; }
 		my $my_260 = $record->field('260');
 		my $publisher = $my_260->subfield('b') if ( $my_260 );
 		my $pubyear = $my_260->subfield('c') if ( $my_260 );
@@ -82,11 +113,6 @@ foreach $argnum ( 3 .. $#ARGV ) {
 
 		# NORMALIZE
 		if ($record_type == ' ') { $record_type = 'a'; }
-		if ($title) {
-			$title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go;
-			$title = lc($title);
-			$title =~ s/\W+$//go;
-		}
 		if ($author) {
 			$author = NFD($author); $author =~ s/[\x{80}-\x{ffff}]//go;
 			$author = lc($author);
@@ -104,39 +130,34 @@ foreach $argnum ( 3 .. $#ARGV ) {
 			}
 		}
 
-		# SPIT OUT FINGERPRINTS FROM THE "LOIS ALGORITHM"
+		# SPIT OUT FINGERPRINTS FROM THE "MODIFIED LOIS ALGORITHM"
 		# If we're not getting good matches, we may want to change this.  The same thing goes for some other fields.
-		if ($item_form && ($date1 =~ /\d\d\d\d/) && $record_type && $bib_lvl && $title) {
+		if ($item_form && ($date1 =~ /\d\d\d\d/) && $record_type && $bib_lvl && $title && $author && $publisher && $pubyear && $pagination) {
 
             if ($which eq "primary") {
-			    print STDOUT join("\t",$id,$item_form,$date1,$record_type,$bib_lvl,$title) . "\n"; 
+                print STDOUT join("\t",$id,$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pagination) . "\n"; 
             } else {
 			
-                # case a : isbn and pages
-                if (scalar(@isbns)>0 && $pages) {
+                # case a : isbn 
+                if (scalar(@isbns)>0) {
                     foreach my $isbn ( @isbns ) {
-                        print STDOUT join("\t",$id,"case a",$item_form,$date1,$record_type,$bib_lvl,$title,$isbn,$pages) . "\n"; 
+                        print STDOUT join("\t",$id,"case a",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pagination,$isbn) . "\n"; 
                     }
                 }
 
                 # case b : edition
                 if ($edition) {
-                    print STDOUT join("\t",$id,"case b",$item_form,$date1,$record_type,$bib_lvl,$title,$edition) . "\n"; 
+                    print STDOUT join("\t",$id,"case b",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pagination,$edition) . "\n"; 
                 }
 
                 # case c : issn
                 if ($issn) {
-                    print STDOUT join("\t",$id,"case c",$item_form,$date1,$record_type,$bib_lvl,$title,$issn) . "\n"; 
+                    print STDOUT join("\t",$id,"case c",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pagination,$issn) . "\n"; 
                 }
 
                 # case d : lccn
                 if ($lccn) {
-                    print STDOUT join("\t",$id,"case d",$item_form,$date1,$record_type,$bib_lvl,$title,$lccn) . "\n"; 
-                }
-
-                # case e : author, publisher, pubyear, pages
-                if ($author && $publisher && $pubyear && $pages) {
-                    print STDOUT join("\t",$id,"case e",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pages) . "\n"; 
+                    print STDOUT join("\t",$id,"case d",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pagination,$lccn) . "\n"; 
                 }
 
             }
@@ -148,6 +169,10 @@ foreach $argnum ( 3 .. $#ARGV ) {
 			print STDERR "Missing record_type. " unless ($record_type);
 			print STDERR "Missing bib_lvl. " unless ($bib_lvl);
 			print STDERR "Missing title. " unless ($title);
+			print STDERR "Missing author. " unless ($author);
+			print STDERR "Missing publisher. " unless ($publisher);
+			print STDERR "Missing pubyear. " unless ($pubyear);
+			print STDERR "Missing pagination. " unless ($pagination);
 			print STDERR "\n";
 
 		}
diff --git a/spit_csv.pl b/spit_csv.pl
new file mode 100755
index 0000000..b0da245
--- /dev/null
+++ b/spit_csv.pl
@@ -0,0 +1,44 @@
+#!/usr/bin/perl
+# Usage: spit_csv.pl marc.xml TAG SUBFIELD [TAG SUBFIELD ...]
+# For each record, writes every requested tag/subfield value that is present to
+# STDOUT, each followed by a tab, then a newline. Absent values emit no column.
+use strict;
+use warnings;
+use MARC::Batch;
+use MARC::File::XML ( BinaryEncoding => 'utf-8' );
+use MARC::Field;
+use Unicode::Normalize;
+
+my ($marc_file, @desired_tags_subfields) = @ARGV;
+die "Usage: $0 marc.xml tag subfield [tag subfield ...]\n"
+    unless defined($marc_file) && scalar(@desired_tags_subfields) % 2 == 0;
+
+my $count = 0;
+
+binmode(STDOUT, ':utf8');
+binmode(STDIN, ':utf8');
+
+print STDERR "Processing " . $marc_file . "\n";
+
+my $batch = MARC::Batch->new('XML', $marc_file);
+$batch->strict_off();
+$batch->warnings_off();    # only silences MARC::Batch's own printing; warnings() below still returns them
+
+while ( my $record = $batch->next() ) {
+    $count++;
+
+    # Fetch the warnings gathered for this record; previously @warnings was never filled, so nothing was reported.
+    my @warnings = $batch->warnings();
+    print STDERR "WARNINGS: Record $count : " . join(":", @warnings) . " : continuing...\n" if (@warnings);
+
+    for (my $i = 0; $i < scalar(@desired_tags_subfields); $i += 2) {
+        my ($tag, $code) = @desired_tags_subfields[ $i, $i + 1 ];
+        foreach my $f ( $record->field($tag) ) {
+            my $value = $f->subfield($code);
+            # Test definedness/emptiness rather than truth so a literal "0" value is still printed.
+            print STDOUT $value . "\t" if ( defined($value) && $value ne '' );
+        }
+    }
+    print STDOUT "\n";
+}
+print STDERR "Processed $count records\n";
-- 
1.7.2.5