From fa0fdbba6133696f34d496242f1d8ea9a37d626c Mon Sep 17 00:00:00 2001 From: Shawn Boyette Date: Tue, 5 Aug 2008 22:02:23 +0000 Subject: [PATCH] moving fingerprint script --- fingerprinter | 241 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fingerprints.pl | 241 ------------------------------------------------------- 2 files changed, 241 insertions(+), 241 deletions(-) create mode 100755 fingerprinter delete mode 100755 fingerprints.pl diff --git a/fingerprinter b/fingerprinter new file mode 100755 index 0000000..c5e5abb --- /dev/null +++ b/fingerprinter @@ -0,0 +1,241 @@ +#!/usr/bin/perl +use strict; +use warnings; +use open ':utf8'; + +use Getopt::Long; +use MARC::Batch; +use MARC::File::XML ( BinaryEncoding => 'utf-8' ); +use MARC::Field; +use Unicode::Normalize; + +my $conf = {}; # configuration hashref +my $marc = {}; # MARC record hashref +my $count = 0; +initialyze($conf); + +for my $file (@ARGV) { + + print STDERR "Processing $file\n"; + + open my $M, '<:utf8', $file; + + my $batch = MARC::Batch->new('XML',$M); + $batch->strict_off(); + $batch->warnings_off(); + + while ( my $record = $batch->next ) { + $count++; + + my $id = $record->field($conf->{tag}); + unless ($id) { + print STDERR "ERROR: This record is missing a", $conf->{tag}, + "field.\n", $record->as_formatted(), "\n=====\n"; + next; + } + $id = $id->as_string($conf->{subfield}); + + my $leader = $record->leader(); + my $record_type = substr($leader,6,1); + my $bib_lvl = substr($leader,7,1); + + my $my_008 = $record->field('008'); + $my_008 = $my_008->as_string() if ($my_008); + my $date1 = substr($my_008,7,4) if ($my_008); + my $date2 = substr($my_008,11,4) if ($my_008); + + my $item_form; + if ( $record_type =~ /[gkroef]/ ) { # MAP, VIS + $item_form = substr($my_008,29,1) if ($my_008); + } else { + $item_form = substr($my_008,23,1) if ($my_008); + } + + my $title = $record->field('245'); + $title = $title->subfield('a') if $title; + + my @isbns = (); + my @isbns_020; + if ($record->field('020')) { @isbns_020 = $record->field('020'); } + foreach my $f ( @isbns_020 ) { + if ($f->subfield('a')) { + if ( $f->subfield('a')=~/(\S+)/ ) { + push @isbns, $1; + } + } + } + my @isbns_024; + if ($record->field('024')) { @isbns_024 = $record->field('024'); } + foreach my $f ( @isbns_024 ) { + if ($f->subfield('a')) { + if ( $f->subfield('a')=~/(\S+)/ ) { + push @isbns, $1; + } + } + } + + my $issn = $record->field('022'); + $issn = $issn->subfield('a') if $issn; + + my $lccn = $record->field('010'); + $lccn = $lccn->subfield('a') if $lccn; + + my $author; + if ($record->field('100')) + { $author = $record->field('100')->subfield('a'); } + unless ( $author ) { + $author = $record->field('110')->subfield('a') + if ($record->field('110')); + $author = $record->field('111')->subfield('a') + if ($record->field('111')); + } + + my $desc = $record->field('300'); + $desc = $desc->subfield('a') if $desc; + + my $pages; + $pages = $1 if (defined $desc and $desc =~ /(\d+)/); + + my $my_260 = $record->field('260'); + my $publisher = $my_260->subfield('b') if $my_260; + my $pubyear = $my_260->subfield('c') if $my_260; + if ( $pubyear ) { + if ( $pubyear =~ /(\d\d\d\d)/ ) + { $pubyear = $1; } + else + { $pubyear = ''; } + } + + my $edition = $record->field('250'); + $edition = $edition->subfield('a') if $edition; + + # NORMALIZE + $record_type = 'a' if ($record_type eq ' '); + if ($title) { + $title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go; + $title = lc($title); + $title =~ s/\W+$//go; + } + if ($author) { + $author = NFD($author); $author =~ s/[\x{80}-\x{ffff}]//go; + $author = lc($author); + $author =~ s/\W+$//go; + if ($author =~ /^(\w+)/) { + $author = $1; + } + } + if ($publisher) { + $publisher = NFD($publisher); $publisher =~ s/[\x{80}-\x{ffff}]//go; + $publisher = lc($publisher); + $publisher =~ s/\W+$//go; + if ($publisher =~ /^(\w+)/) { + $publisher = $1; + } + } + + # SPIT OUT FINGERPRINTS FROM THE "LOIS ALGORITHM" + # If we're not getting good matches, we may want to change this. + # The same thing goes for some other fields. + if ($item_form && ($date1 =~ /\d\d\d\d/) + && $record_type && $bib_lvl && $title) { + if ($conf->{runtype} eq "primary") { + print STDOUT + join("\t",$id,$item_form,$date1,$record_type,$bib_lvl,$title) + ,"\n"; + } else { + # case a : isbn and pages + if (scalar(@isbns)>0 && $pages) { + foreach my $isbn ( @isbns ) { + print STDOUT + join("\t", $id, "case a", $item_form, $date1, + $record_type, $bib_lvl, $title, $isbn, $pages) + ,"\n"; + } + } + # case b : edition + if ($edition) { + print STDOUT + join("\t", $id, "case b", $item_form, $date1, + $record_type, $bib_lvl, $title,$edition), "\n"; + } + # case c : issn + if ($issn) { + print STDOUT join("\t", $id, "case c", $item_form, $date1, + $record_type, $bib_lvl, $title, $issn) + ,"\n"; + } + # case d : lccn + if ($lccn) { + print STDOUT join("\t", $id, "case d", $item_form, $date1, + $record_type, $bib_lvl, $title, $lccn) + ,"\n"; + } + # case e : author, publisher, pubyear, pages + if ($author && $publisher && $pubyear && $pages) { + print STDOUT join("\t", $id, "case e", $item_form, $date1, + $record_type, $bib_lvl, $title, $author, + $publisher, $pubyear, $pages), "\n"; + } + } + } else { + print STDERR "Record " . $id . " did not make the cut: "; + print STDERR "Missing item_form. " unless ($item_form); + print STDERR "Missing valid date1. " unless ($date1 =~ /\d\d\d\d/); + print STDERR "Missing record_type. " unless ($record_type); + print STDERR "Missing bib_lvl. " unless ($bib_lvl); + print STDERR "Missing title. " unless ($title); + print STDERR "\n"; + } + } + print STDERR "Processed $count records\n"; +} + +=head2 initialyze + +Performs boring script initialization. Handles argument parsing, +mostly. + +=cut + +sub initialyze { + my ($c) = @_; + my @missing = (); + + # set mode on existing filehandles + binmode(STDOUT, ':utf8'); + binmode(STDIN, ':utf8'); + + my $rc = GetOptions( $c, + 'runtype|r=s', + 'tag|t=s', + 'subfield|s=s', + ); + show_help() unless $rc; + my @keys = keys %{$c}; + show_help() unless (@ARGV and @keys); + + for my $key ('runtype', 'tag', 'subfield') { + push @missing, $key unless $c->{$key} + } + if (@missing) { + print "Required option: ", join(', ', @missing), " missing!\n"; + show_help(); + } +} + +=head2 show_help + +=cut + +sub show_help { +print < +Req'd Arguments + --runtype=(primary|full) -r Do 'primary' or 'full' fingerprinting + --tag=N -t Which tag to use + --subfield=X -s Which subfield to use +Options + None yet... +HELP +exit 1; +} diff --git a/fingerprints.pl b/fingerprints.pl deleted file mode 100755 index c5e5abb..0000000 --- a/fingerprints.pl +++ /dev/null @@ -1,241 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; -use open ':utf8'; - -use Getopt::Long; -use MARC::Batch; -use MARC::File::XML ( BinaryEncoding => 'utf-8' ); -use MARC::Field; -use Unicode::Normalize; - -my $conf = {}; # configuration hashref -my $marc = {}; # MARC record hashref -my $count = 0; -initialyze($conf); - -for my $file (@ARGV) { - - print STDERR "Processing $file\n"; - - open my $M, '<:utf8', $file; - - my $batch = MARC::Batch->new('XML',$M); - $batch->strict_off(); - $batch->warnings_off(); - - while ( my $record = $batch->next ) { - $count++; - - my $id = $record->field($conf->{tag}); - unless ($id) { - print STDERR "ERROR: This record is missing a", $conf->{tag}, - "field.\n", $record->as_formatted(), "\n=====\n"; - next; - } - $id = $id->as_string($conf->{subfield}); - - my $leader = $record->leader(); - my $record_type = substr($leader,6,1); - my $bib_lvl = substr($leader,7,1); - - my $my_008 = $record->field('008'); - $my_008 = $my_008->as_string() if ($my_008); - my $date1 = substr($my_008,7,4) if ($my_008); - my $date2 = substr($my_008,11,4) if ($my_008); - - my $item_form; - if ( $record_type =~ /[gkroef]/ ) { # MAP, VIS - $item_form = substr($my_008,29,1) if ($my_008); - } else { - $item_form = substr($my_008,23,1) if ($my_008); - } - - my $title = $record->field('245'); - $title = $title->subfield('a') if $title; - - my @isbns = (); - my @isbns_020; - if ($record->field('020')) { @isbns_020 = $record->field('020'); } - foreach my $f ( @isbns_020 ) { - if ($f->subfield('a')) { - if ( $f->subfield('a')=~/(\S+)/ ) { - push @isbns, $1; - } - } - } - my @isbns_024; - if ($record->field('024')) { @isbns_024 = $record->field('024'); } - foreach my $f ( @isbns_024 ) { - if ($f->subfield('a')) { - if ( $f->subfield('a')=~/(\S+)/ ) { - push @isbns, $1; - } - } - } - - my $issn = $record->field('022'); - $issn = $issn->subfield('a') if $issn; - - my $lccn = $record->field('010'); - $lccn = $lccn->subfield('a') if $lccn; - - my $author; - if ($record->field('100')) - { $author = $record->field('100')->subfield('a'); } - unless ( $author ) { - $author = $record->field('110')->subfield('a') - if ($record->field('110')); - $author = $record->field('111')->subfield('a') - if ($record->field('111')); - } - - my $desc = $record->field('300'); - $desc = $desc->subfield('a') if $desc; - - my $pages; - $pages = $1 if (defined $desc and $desc =~ /(\d+)/); - - my $my_260 = $record->field('260'); - my $publisher = $my_260->subfield('b') if $my_260; - my $pubyear = $my_260->subfield('c') if $my_260; - if ( $pubyear ) { - if ( $pubyear =~ /(\d\d\d\d)/ ) - { $pubyear = $1; } - else - { $pubyear = ''; } - } - - my $edition = $record->field('250'); - $edition = $edition->subfield('a') if $edition; - - # NORMALIZE - $record_type = 'a' if ($record_type eq ' '); - if ($title) { - $title = NFD($title); $title =~ s/[\x{80}-\x{ffff}]//go; - $title = lc($title); - $title =~ s/\W+$//go; - } - if ($author) { - $author = NFD($author); $author =~ s/[\x{80}-\x{ffff}]//go; - $author = lc($author); - $author =~ s/\W+$//go; - if ($author =~ /^(\w+)/) { - $author = $1; - } - } - if ($publisher) { - $publisher = NFD($publisher); $publisher =~ s/[\x{80}-\x{ffff}]//go; - $publisher = lc($publisher); - $publisher =~ s/\W+$//go; - if ($publisher =~ /^(\w+)/) { - $publisher = $1; - } - } - - # SPIT OUT FINGERPRINTS FROM THE "LOIS ALGORITHM" - # If we're not getting good matches, we may want to change this. - # The same thing goes for some other fields. - if ($item_form && ($date1 =~ /\d\d\d\d/) - && $record_type && $bib_lvl && $title) { - if ($conf->{runtype} eq "primary") { - print STDOUT - join("\t",$id,$item_form,$date1,$record_type,$bib_lvl,$title) - ,"\n"; - } else { - # case a : isbn and pages - if (scalar(@isbns)>0 && $pages) { - foreach my $isbn ( @isbns ) { - print STDOUT - join("\t", $id, "case a", $item_form, $date1, - $record_type, $bib_lvl, $title, $isbn, $pages) - ,"\n"; - } - } - # case b : edition - if ($edition) { - print STDOUT - join("\t", $id, "case b", $item_form, $date1, - $record_type, $bib_lvl, $title,$edition), "\n"; - } - # case c : issn - if ($issn) { - print STDOUT join("\t", $id, "case c", $item_form, $date1, - $record_type, $bib_lvl, $title, $issn) - ,"\n"; - } - # case d : lccn - if ($lccn) { - print STDOUT join("\t", $id, "case d", $item_form, $date1, - $record_type, $bib_lvl, $title, $lccn) - ,"\n"; - } - # case e : author, publisher, pubyear, pages - if ($author && $publisher && $pubyear && $pages) { - print STDOUT join("\t", $id, "case e", $item_form, $date1, - $record_type, $bib_lvl, $title, $author, - $publisher, $pubyear, $pages), "\n"; - } - } - } else { - print STDERR "Record " . $id . " did not make the cut: "; - print STDERR "Missing item_form. " unless ($item_form); - print STDERR "Missing valid date1. " unless ($date1 =~ /\d\d\d\d/); - print STDERR "Missing record_type. " unless ($record_type); - print STDERR "Missing bib_lvl. " unless ($bib_lvl); - print STDERR "Missing title. " unless ($title); - print STDERR "\n"; - } - } - print STDERR "Processed $count records\n"; -} - -=head2 initialyze - -Performs boring script initialization. Handles argument parsing, -mostly. - -=cut - -sub initialyze { - my ($c) = @_; - my @missing = (); - - # set mode on existing filehandles - binmode(STDOUT, ':utf8'); - binmode(STDIN, ':utf8'); - - my $rc = GetOptions( $c, - 'runtype|r=s', - 'tag|t=s', - 'subfield|s=s', - ); - show_help() unless $rc; - my @keys = keys %{$c}; - show_help() unless (@ARGV and @keys); - - for my $key ('runtype', 'tag', 'subfield') { - push @missing, $key unless $c->{$key} - } - if (@missing) { - print "Required option: ", join(', ', @missing), " missing!\n"; - show_help(); - } -} - -=head2 show_help - -=cut - -sub show_help { -print < -Req'd Arguments - --runtype=(primary|full) -r Do 'primary' or 'full' fingerprinting - --tag=N -t Which tag to use - --subfield=X -s Which subfield to use -Options - None yet... -HELP -exit 1; -} -- 1.7.2.5