From de9de34628637b78668e4ac358971d05cfb6b87f Mon Sep 17 00:00:00 2001
From: Jason Etheridge
Date: Thu, 12 Jun 2008 16:31:29 +0000
Subject: [PATCH] Example: spit_csv.pl marc.xml 999 a 903 a ; ### produces a tab-delimited file, one line per record, with 999a as the first column and 903a as the second column

---
Review notes (text between "---" and the diffstat is ignored when applying):
 * fingerprints.pl: the 245$p test called subfield() on $title, which at that
   point holds a plain title string, not a field object; it now uses $my_245
   like the neighbouring 245$b test.
 * spit_csv.pl: @warnings was never assigned, so the WARNINGS message could
   never print; it is now filled from $batch->warnings().
 * NOTE(review): "my $title" is scoped to the "if ( $my_245 )" block, while
   the later hunks still test and print $title. Unless the rest of
   fingerprints.pl declares it elsewhere, those uses no longer see the 245
   title -- confirm, and consider iterating over @titles instead.
 * This copy was rebuilt from a whitespace-collapsed paste: indentation and
   the position of blank context lines are best guesses constrained by the
   hunk line counts. Verify against fingerprints.pl before applying.

 fingerprints.pl |   71 +++++++++++++++++++++++++++++++++++++-----------------
 spit_csv.pl     |   44 ++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+), 23 deletions(-)
 create mode 100755 spit_csv.pl

diff --git a/fingerprints.pl b/fingerprints.pl
index a7d43ee..8b78668 100755
--- a/fingerprints.pl
+++ b/fingerprints.pl
@@ -46,8 +46,39 @@ foreach $argnum ( 3 .. $#ARGV ) {
             $item_form = substr($my_008,23,1) if ($my_008);
         }
 
-        my $title = $record->field('245');
-        if ( $title ) { $title = $title->subfield('a'); }
+        my @titles = ();
+        my $my_245 = $record->field('245');
+        if ( $my_245 ) {
+            my $title = $my_245->subfield('a');
+            $title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go; $title = lc($title); $title =~ s/\W+$//go; $title =~ s/^\W+//go; push @titles, $title;
+            if ($my_245->subfield('b')) {
+                $title = $my_245->subfield('a') . ', ' . $my_245->subfield('b');
+                $title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go; $title = lc($title); $title =~ s/\W+$//go; $title =~ s/^\W+//go; push @titles, $title;
+
+                $title = "_magic_prefix_for_special_case_1_" .$my_245->subfield('b');
+                $title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go; $title = lc($title); $title =~ s/\W+$//go; $title =~ s/^\W+//go; push @titles, $title;
+            }
+            if ($my_245->subfield('p')) {
+                $title = $my_245->subfield('a') . ', ' . $my_245->subfield('p');
+                $title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go; $title = lc($title); $title =~ s/\W+$//go; $title =~ s/^\W+//go; push @titles, $title;
+            }
+            my $my_440 = $record->field('440');
+            if ($my_440 && $my_440->subfield('a')) {
+                $title = $my_440->subfield('a') . ', ' . $my_245->subfield('a');
+                $title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go; $title = lc($title); $title =~ s/\W+$//go; $title =~ s/^\W+//go; push @titles, $title;
+
+                $title = "_magic_prefix_for_special_case_1_" .$my_245->subfield('a');
+                $title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go; $title = lc($title); $title =~ s/\W+$//go; $title =~ s/^\W+//go; push @titles, $title;
+            }
+            my $my_490 = $record->field('490');
+            if ($my_490 && $my_490->subfield('a')) {
+                $title = $my_490->subfield('a') . ', ' . $my_245->subfield('a');
+                $title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go; $title = lc($title); $title =~ s/\W+$//go; $title =~ s/^\W+//go; push @titles, $title;
+
+                $title = "_magic_prefix_for_special_case_1_" .$my_245->subfield('a');
+                $title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go; $title = lc($title); $title =~ s/\W+$//go; $title =~ s/^\W+//go; push @titles, $title;
+            }
+        }
 
         my @isbns = ();
         my @isbns_020; if ($record->field('020')) { @isbns_020 = $record->field('020'); }
@@ -69,8 +100,8 @@ foreach $argnum ( 3 .. $#ARGV ) {
         }
         my $desc = $record->field('300');
         if ( $desc ) { $desc = $desc->subfield('a'); }
-        my $pages;
-        if ($desc =~ /(\d+)/) { $pages = $1; }
+        my $pagination;
+        if ($desc =~ /(\d+)/) { $pagination = $1; }
         my $my_260 = $record->field('260');
         my $publisher = $my_260->subfield('b') if ( $my_260 );
         my $pubyear = $my_260->subfield('c') if ( $my_260 );
@@ -82,11 +113,6 @@ foreach $argnum ( 3 .. $#ARGV ) {
 
         # NORMALIZE
         if ($record_type == ' ') { $record_type = 'a'; }
-        if ($title) {
-            $title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go;
-            $title = lc($title);
-            $title =~ s/\W+$//go;
-        }
         if ($author) {
             $author = NFD($author); $author =~ s/[\x{80}-\x{ffff}]//go;
             $author = lc($author);
@@ -104,39 +130,34 @@ foreach $argnum ( 3 .. $#ARGV ) {
             }
         }
 
-        # SPIT OUT FINGERPRINTS FROM THE "LOIS ALGORITHM"
+        # SPIT OUT FINGERPRINTS FROM THE "MODIFIED LOIS ALGORITHM"
         # If we're not getting good matches, we may want to change this. The same thing goes for some other fields.
 
-        if ($item_form && ($date1 =~ /\d\d\d\d/) && $record_type && $bib_lvl && $title) {
+        if ($item_form && ($date1 =~ /\d\d\d\d/) && $record_type && $bib_lvl && $title && $author && $publisher && $pubyear && $pagination) {
             if ($which eq "primary") {
-                print STDOUT join("\t",$id,$item_form,$date1,$record_type,$bib_lvl,$title) . "\n";
+                print STDOUT join("\t",$id,$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pagination) . "\n";
             } else {
 
-                # case a : isbn and pages
-                if (scalar(@isbns)>0 && $pages) {
+                # case a : isbn
+                if (scalar(@isbns)>0) {
                     foreach my $isbn ( @isbns ) {
-                        print STDOUT join("\t",$id,"case a",$item_form,$date1,$record_type,$bib_lvl,$title,$isbn,$pages) . "\n";
+                        print STDOUT join("\t",$id,"case a",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pagination,$isbn) . "\n";
                     }
                 }
 
                 # case b : edition
                 if ($edition) {
-                    print STDOUT join("\t",$id,"case b",$item_form,$date1,$record_type,$bib_lvl,$title,$edition) . "\n";
+                    print STDOUT join("\t",$id,"case b",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pagination,$edition) . "\n";
                 }
 
                 # case c : issn
                 if ($issn) {
-                    print STDOUT join("\t",$id,"case c",$item_form,$date1,$record_type,$bib_lvl,$title,$issn) . "\n";
+                    print STDOUT join("\t",$id,"case c",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pagination,$issn) . "\n";
                 }
 
                 # case d : lccn
                 if ($lccn) {
-                    print STDOUT join("\t",$id,"case d",$item_form,$date1,$record_type,$bib_lvl,$title,$lccn) . "\n";
-                }
-
-                # case e : author, publisher, pubyear, pages
-                if ($author && $publisher && $pubyear && $pages) {
-                    print STDOUT join("\t",$id,"case e",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pages) . "\n";
+                    print STDOUT join("\t",$id,"case d",$item_form,$date1,$record_type,$bib_lvl,$title,$author,$publisher,$pubyear,$pagination,$lccn) . "\n";
                 }
             }
 
@@ -148,6 +169,10 @@ foreach $argnum ( 3 .. $#ARGV ) {
             print STDERR "Missing record_type. " unless ($record_type);
             print STDERR "Missing bib_lvl. " unless ($bib_lvl);
             print STDERR "Missing title. " unless ($title);
+            print STDERR "Missing author. " unless ($author);
+            print STDERR "Missing publisher. " unless ($publisher);
+            print STDERR "Missing pubyear. " unless ($pubyear);
+            print STDERR "Missing pagination. " unless ($pagination);
             print STDERR "\n";
         }
 
diff --git a/spit_csv.pl b/spit_csv.pl
new file mode 100755
index 0000000..b0da245
--- /dev/null
+++ b/spit_csv.pl
@@ -0,0 +1,44 @@
+#!/usr/bin/perl
+use MARC::Batch;
+use MARC::File::XML ( BinaryEncoding => 'utf-8' );
+use MARC::Field;
+use Unicode::Normalize;
+
+
+my @desired_tags_subfields = ();
+foreach my $argnum ( 1 .. $#ARGV) {
+    push @desired_tags_subfields, $ARGV[$argnum];
+}
+
+my $count = 0;
+
+binmode(STDOUT, ':utf8');
+binmode(STDIN, ':utf8');
+
+foreach my $argnum ( 0 .. 0 ) {
+
+    print STDERR "Processing " . $ARGV[$argnum] . "\n";
+
+    my $batch = MARC::Batch->new('XML',$ARGV[$argnum]);
+    $batch->strict_off();
+    $batch->warnings_off();
+
+    while ( my $record = $batch->next() ) {
+
+        $count++;
+
+        if ( my @warnings = $batch->warnings() ) { print STDERR "WARNINGS: Record $count : " . join(":",@warnings) . " : continuing...\n"; }
+
+        for (my $i = 0; $i < scalar(@desired_tags_subfields); $i+=2) {
+            my @tags; if ($record->field($desired_tags_subfields[$i])) { @tags = $record->field($desired_tags_subfields[$i]); }
+            foreach my $f ( @tags ) {
+                if ($f->subfield($desired_tags_subfields[$i+1])) {
+                    print STDOUT $f->subfield($desired_tags_subfields[$i+1]) . "\t";
+                }
+            }
+        }
+        print STDOUT "\n";
+
+    }
+    print STDERR "Processed $count records\n";
+}
-- 
1.7.2.5