X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=marc_cleanup;h=e0c20a781afd6f13c10402f017df98f913e9e8cf;hp=14d4e415e98284cfde48a17da95014140c44f80d;hb=1a3ca3ba98d644892f5fbad6084e515736394452;hpb=960b4ce2842d87080a9e577a118d436e26bf7303 diff --git a/marc_cleanup b/marc_cleanup index 14d4e41..e0c20a7 100755 --- a/marc_cleanup +++ b/marc_cleanup @@ -1,4 +1,21 @@ #!/usr/bin/perl + +# Copyright 2009-2012, Equinox Software, Inc. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + require 5.10.0; use strict; @@ -76,7 +93,7 @@ while ( buildrecord() ) { until ($ptr == $#record) { # get datafield/tag data if we have it $rc = stow_record_data() if ($c->{'renumber-from'} and $c->{'original-tag'}); - return $rc if $rc; + next if $rc; # naked ampersands if ($record[$ptr] =~ /&/ && $record[$ptr] !~ /&\w+?;/) @@ -176,6 +193,12 @@ sub do_automated_cleanups { message("Dollar sign corrected"); } + # excessive trailing whitespace in subfield contents + if ($record[$ptr] =~ m|\s{10,}|) { + $record[$ptr] =~ s|\s{10,}||; + message("Trailing whitespace trimmed from subfield contents"); + } + # automatable subfield maladies $record[$ptr] =~ s/code=" ">c/code="c">/; $record[$ptr] =~ s/code=" ">\$/code="c">\$/; @@ -186,13 +209,13 @@ sub do_automated_cleanups { sub stow_record_data { # get tag data if we're looking at it my $tag = 0; - if ($record[$ptr] =~ m/<(control|data)field tag="(?.{3})"/) { - $recmeta{tag} = $+{TAG}; + if ($record[$ptr] =~ m/<(?:control|data)field tag="(.{3})"/) { + $recmeta{tag} = $1; $tag = $recmeta{tag}; - $record[$ptr] =~ m/ind1="(?.)"/; - $recmeta{ind1} = $+{IND1} || ''; - $record[$ptr] =~ m/ind2="(?.)"/; - $recmeta{ind2} = $+{IND2} || ''; + $record[$ptr] =~ m/ind1="(.)"/; + $recmeta{ind1} = $1 || ''; + $record[$ptr] =~ m/ind2="(.)"/; + $recmeta{ind2} = $1 || ''; unless ($tag) { message("Autokill record: no detectable tag"); @@ -299,7 +322,7 @@ sub buildrecord { my $istrash = 0; my $trash = $c->{trash}; - $l = while (defined $l and $l !~ //); + $l = while (defined $l and $l !~ /{ricount}++; @@ -603,17 +626,21 @@ sub initialize { show_trashhelp() if ($c->{trashhelp}); # defaults - my $pfx = $c->{prefix} // "bibs"; + my $pfx = defined($c->{prefix}) ? $c->{prefix} : "bibs"; $c->{ricount} = 0; $c->{rocount} = 0; - $c->{output} = join('.',$c->{prefix},'clean','marc','xml'); - $c->{exception} = join('.',$c->{prefix},'exception','marc','xml'); $c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'}; $c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'}; $c->{window} = 9; - - $c->{marcfile} = $c->{prefix} . '.marc.xml' - unless $c->{marcfile}; + if ($c->{prefix}) { + $c->{output} = join('.',$c->{prefix},'clean','marc','xml') + unless $c->{output}; + $c->{exception} = join('.',$c->{prefix},'exception','marc','xml') + unless $c->{exception}; + $c->{marcfile} = $c->{prefix} . '.marc.xml' + unless $c->{marcfile}; + } + show_help() unless ($c->{marcfile} and $c->{output}); if ($c->{trashfile}) { $c->{trash} = Equinox::Migration::SimpleTagList->new(file => $c->{trashfile}) @@ -628,19 +655,16 @@ sub initialize { # remove original id sequence tag from trash hash if we know it $c->{trash}->remove_tag($c->{'original-tag'}) if ( $c->{'original-tag'} and $c->{trash}->has($c->{'original-tag'}) ); - - my @keys = keys %{$c}; - show_help() unless @keys; } sub show_help { print < +Usage is: marc_cleanup [OPTIONS] Options --output -o Cleaned MARCXML output filename --exception -x Exception (dumped records) MARCXML filename or - --prefix=> -p Shared prefix for output/exception files. Will produce + --prefix= -p Shared prefix for output/exception files. Will produce PREFIX.clean.marc.xml and PREFIX.exception.marc.xml --marcfile -m Input filename. Defaults to PREFIX.marc.xml @@ -662,8 +686,6 @@ Options --fullauto No manual edits. All problematic records dumped to exception file. - --script Store human-initiated ops in scriptfile (.mcscript) - Not yet implemented HELP exit; }