X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=marc_cleanup;h=e2a9390ecf6a33ce6cf52dc91b5664faf2989c36;hp=f0c3753a16cd036f65bfd666ed0933a4bad211af;hb=cbf98af9e9d79cee9f8c99267c3e39326487167e;hpb=ecd9a840150d8aac37606594e9abaf0ce4838dfb diff --git a/marc_cleanup b/marc_cleanup index f0c3753..e2a9390 100755 --- a/marc_cleanup +++ b/marc_cleanup @@ -1,4 +1,21 @@ #!/usr/bin/perl + +# Copyright 2009-2012, Equinox Software, Inc. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ require 5.10.0; use strict; @@ -76,7 +93,7 @@ while ( buildrecord() ) { until ($ptr == $#record) { # get datafield/tag data if we have it $rc = stow_record_data() if ($c->{'renumber-from'} and $c->{'original-tag'}); - return $rc if $rc; + next if $rc; # naked ampersands if ($record[$ptr] =~ /&/ && $record[$ptr] !~ /&\w+?;/) @@ -98,7 +115,7 @@ while ( buildrecord() ) { next; } } - # subfields can't be non-alphanumeric + # subfields can't be larger than 1 char (technically you could make the MARC format accommodate that:) if ($record[$ptr] =~ /|) { + $record[$ptr] =~ s|\s{10,}||; + message("Trailing whitespace trimmed from subfield contents"); + } + # automatable subfield maladies $record[$ptr] =~ s/code=" ">c/code="c">/; $record[$ptr] =~ s/code=" ">\$/code="c">\$/; + + if ($c->{'fix-subfield'}) { + $record[$ptr] =~ s/code="&">/code="$c->{'fix-subfield'}">/; + $record[$ptr] =~ s/code="\P{IsAlnum}">/code="$c->{'fix-subfield'}">/; + $record[$ptr] =~ s/code="">/code="$c->{'fix-subfield'}">/; + } } return 0; } @@ -186,7 +215,7 @@ sub do_automated_cleanups { sub stow_record_data { # get tag data if we're looking at it my $tag = 0; - if ($record[$ptr] =~ m/<(control|data)field tag="(.{3})"/) { + if ($record[$ptr] =~ m/<(?:control|data)field tag="(.{3})"/) { $recmeta{tag} = $1; $tag = $recmeta{tag}; $record[$ptr] =~ m/ind1="(.)"/; @@ -299,7 +328,7 @@ sub buildrecord { my $istrash = 0; my $trash = $c->{trash}; - $l = while (defined $l and $l !~ //); + $l = while (defined $l and $l !~ /{ricount}++; @@ -592,6 +621,7 @@ sub initialize { 'renumber-subfield|rs=s', 'original-tag|ot=i', 'original-subfield|os=s', + 'fix-subfield|fs=s', 'script', 'no-strip9', 'trashfile|t=s', @@ -603,7 +633,7 @@ sub initialize { show_trashhelp() if ($c->{trashhelp}); # defaults - my $pfx = $c->{prefix} // "bibs"; + my $pfx = defined($c->{prefix}) ? 
$c->{prefix} : "bibs"; $c->{ricount} = 0; $c->{rocount} = 0; $c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'}; @@ -641,7 +671,7 @@ Options --output -o Cleaned MARCXML output filename --exception -x Exception (dumped records) MARCXML filename or - --prefix=> -p Shared prefix for output/exception files. Will produce + --prefix= -p Shared prefix for output/exception files. Will produce PREFIX.clean.marc.xml and PREFIX.exception.marc.xml --marcfile -m Input filename. Defaults to PREFIX.marc.xml @@ -655,13 +685,15 @@ Options and renumbering is in effect, an old-to-new mapping file (old2new.map) will be generated. - --autoscrub -a Automatically remove non-numeric tags in data - --nocollapse -n Don't compress records to one line on output - --no-strip9 Don't autoremove 901/903 tags in data - --trashfile -t File containing trash tag data (see --trashhelp) + --autoscrub -a Automatically remove non-numeric tags in data + --fix-subfield -fs Subfield code to use in place of non-alphanumeric + or empty subfield codes + --nocollapse -n Don't compress records to one line on output + --no-strip9 Don't autoremove 901/903 tags in data + --trashfile -t File containing trash tag data (see --trashhelp) - --fullauto No manual edits. All problematic records dumped to - exception file. + --fullauto No manual edits. All problematic records dumped to + exception file. HELP exit;