From 16c6b51d01c51074067332264b05f40124f7c8bd Mon Sep 17 00:00:00 2001 From: Galen Charlton Date: Wed, 30 Jul 2014 19:53:02 +0000 Subject: [PATCH] make the fingerprinter normalize OCLC control numbers Various ways of representing the same control number are now normalized to "(OCoLC)" followed by the numeric portion of the control number. This patch also introduces a new module, Equinox::Migration::Utils, which is meant for miscellaneous utility routines. Signed-off-by: Galen Charlton --- Equinox-Migration/lib/Equinox/Migration/Utils.pm | 88 ++++++++++++++++++++++ Equinox-Migration/t/05-Utils.t | 25 ++++++ fingerprinter | 17 +++- 3 files changed, 125 insertions(+), 5 deletions(-) create mode 100644 Equinox-Migration/lib/Equinox/Migration/Utils.pm create mode 100644 Equinox-Migration/t/05-Utils.t diff --git a/Equinox-Migration/lib/Equinox/Migration/Utils.pm b/Equinox-Migration/lib/Equinox/Migration/Utils.pm new file mode 100644 index 0000000..f312b04 --- /dev/null +++ b/Equinox-Migration/lib/Equinox/Migration/Utils.pm @@ -0,0 +1,88 @@ +package Equinox::Migration::Utils; + +# Copyright 2014, Equinox Software, Inc. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ +use strict; +use warnings; + +BEGIN { + require Exporter; + + our $VERSION = 1.00; + our @ISA = qw(Exporter); + our @EXPORT = (); + our @EXPORT_OK = qw(normalize_oclc_number); +} + +sub normalize_oclc_number { + my $str = shift; + + # trim + $str =~ s/^\s+//; + $str =~ s/\s+$//; + + # get rid of prefixes + $str =~ s/^\(OCoLC\)//i; + $str =~ s/^(ocl7|ocm|ocn|on)//i; + + # ... and any leading zeroes + $str =~ s/^0+//; + + if ($str =~ /^\d+$/) { + return '(OCoLC)' . $str; + } else { + return; + } +} + +=head1 NAME + +Equinox::Migration::Utils - utility functions + +=head1 SYNOPSIS + + use Equinox::Migration::Utils qw/normalize_oclc_number/; + my $normalized = normalize_oclc_number($oclc); + +=head1 FUNCTIONS + +=head2 normalize_oclc_number + + my $normalized = normalize_oclc_number($oclc); + +Returns a normalized form of a string that is assumed to be +an OCLC control number. The normalized form consists of the +string "(OCoLC)" followed by the numeric portion of the OCLC +number, sans leading zeroes. + +The input string is expected to be a sequence of digits with +optional leading and trailing whitespace and an optional prefix +from a set observed in the wild, e.g., "(OCoLC)", "ocm", and so +forth. If the input string does not meet this condition, the +undefined value is returned. + +=head1 AUTHOR + +Galen Charlton + +=head1 COPYRIGHT + +Copyright 2014, Equinox Software Inc. + +=cut + +1; diff --git a/Equinox-Migration/t/05-Utils.t b/Equinox-Migration/t/05-Utils.t new file mode 100644 index 0000000..1198c62 --- /dev/null +++ b/Equinox-Migration/t/05-Utils.t @@ -0,0 +1,25 @@ +# Copyright 2014, Equinox Software, Inc. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +use strict; +use warnings; + +use Test::More tests => 3; +use Equinox::Migration::Utils qw/normalize_oclc_number/; + +is(normalize_oclc_number('ocm38548133'), '(OCoLC)38548133', 'prefixed with "ocm"'); +is(normalize_oclc_number(' ocm38548133 '), '(OCoLC)38548133', 'ignore leading/trailing whitespace'); +is(normalize_oclc_number('(OCoLC)ocm00123456'), '(OCoLC)123456', 'ignore leading zeroes in number'); diff --git a/fingerprinter b/fingerprinter index a3af67a..f388174 100755 --- a/fingerprinter +++ b/fingerprinter @@ -25,6 +25,7 @@ use MARC::Batch; use Unicode::Normalize; use MARC::File::XML ( BinaryEncoding => 'utf-8' ); use Equinox::Migration::SubfieldMapper; +use Equinox::Migration::Utils qw/normalize_oclc_number/; my $conf = {}; # configuration hashref my $count = 0; my $scount = 0; @@ -148,13 +149,19 @@ sub populate_marc { # oclc $marc{oclc} = []; - push @{ $marc{oclc} }, $record->field('001')->as_string() - if ($record->field('001') and $record->field('003') and - $record->field('003')->as_string() =~ /OCo{0,1}LC/); + if ($record->field('001') && + $record->field('003') && + $record->field('003')->as_string() =~ /OCo{0,1}LC/ && + defined normalize_oclc_number($record->field('001')->as_string())) { + push @{ $marc{oclc} }, normalize_oclc_number($record->field('001')->as_string()); + } for ($record->field('035')) { my $oclc = $_->subfield('a'); - push @{ $marc{oclc} }, $oclc - if (defined $oclc and $oclc =~ /\(OCoLC\)/ and $oclc =~/([0-9]+)/); + if (defined $oclc && + ($oclc =~ /\(OCoLC\)/ || $oclc =~ 
/(ocm|ocl7|ocn|on)/) && + defined normalize_oclc_number($oclc)) { + push @{ $marc{oclc} }, normalize_oclc_number($oclc); + } } if ($record->field('999')) { -- 1.7.2.5