X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=marc_cleanup;h=524109eb3dcde5e7bf41a9717eb01e55c142d6f2;hp=f978ef3d42fa808f71c229c7f0a0db184a5c4eb5;hb=ab57bcd894631f2952e28fca3ee548c6334fa6f1;hpb=a5e1b131e101f84386fade9385dfc4f731b59ae1 diff --git a/marc_cleanup b/marc_cleanup index f978ef3..524109e 100755 --- a/marc_cleanup +++ b/marc_cleanup @@ -1,4 +1,21 @@ #!/usr/bin/perl + +# Copyright 2009-2012, Equinox Software, Inc. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + require 5.10.0; use strict; @@ -76,7 +93,7 @@ while ( buildrecord() ) { until ($ptr == $#record) { # get datafield/tag data if we have it $rc = stow_record_data() if ($c->{'renumber-from'} and $c->{'original-tag'}); - return $rc if $rc; + next if $rc; # naked ampersands if ($record[$ptr] =~ /&/ && $record[$ptr] !~ /&\w+?;/) @@ -94,11 +111,11 @@ while ( buildrecord() ) { # subfields can't be non-alphanumeric if ($record[$ptr] =~ /{'force-utf8'}) { + if ($record[$ptr] =~ m|(.........).(.+)|) { + $record[$ptr] = "$1a$2\n"; + } + } if ($record[$ptr] =~ m|(.+?)|) { + $record[$ptr] =~ s|\s{10,}||; + message("Trailing whitespace trimmed from subfield contents"); + } + # automatable subfield maladies - $record[$ptr] =~ s/code=" ">c/code="c">/; - $record[$ptr] =~ s/code=" ">\$/code="c">\$/; + if ($record[$ptr] =~ /code=" ">c/) { + message('Fixing probable subfield c, scenario 1'); + $record[$ptr] =~ s/code=" ">c/code="c">/; + } + if ($record[$ptr] =~ /code=" ">\$/) { + message('Fixing probable subfield c, scenario 2'); + $record[$ptr] =~ s/code=" ">\$/code="c">\$/; + } + + if ($c->{'fix-subfield'}) { + if ($record[$ptr] =~ /code="&">/) { + message('Fixing & for subfield code'); + $record[$ptr] =~ s/code="&">/code="$c->{'fix-subfield'}">/; + } + if ($record[$ptr] =~ /code="(.*?\P{IsAlnum}.*?)">/) { + message("Fixing non-alphanumeric subfield code: $1 -> " . $c->{'fix-subfield'}); + $record[$ptr] =~ s/code=".*?\P{IsAlnum}.*?">/code="$c->{'fix-subfield'}">/; + } + if ($record[$ptr] =~ /code="">/) { + message('Fixing null subfield code'); + $record[$ptr] =~ s/code="">/code="$c->{'fix-subfield'}">/; + } + } } return 0; } @@ -299,7 +348,7 @@ sub buildrecord { my $istrash = 0; my $trash = $c->{trash}; - $l = while (defined $l and $l !~ //); + $l = while (defined $l and $l !~ /{ricount}++; @@ -592,6 +641,8 @@ sub initialize { 'renumber-subfield|rs=s', 'original-tag|ot=i', 'original-subfield|os=s', + 'fix-subfield|fs=s', + 'force-utf8', 'script', 'no-strip9', 'trashfile|t=s', @@ -609,6 +660,13 @@ sub initialize { $c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'}; $c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'}; $c->{window} = 9; + if ($c->{marcfile} and $c->{prefix}) { abort('You can not declare a marc file and prefix.'); } + if ($c->{marcfile}) { + $c->{output} = join('.',$c->{marcfile},'clean') + unless $c->{output}; + $c->{exception} = join('.',$c->{marcfile},'exception') + unless $c->{exception}; + } if ($c->{prefix}) { $c->{output} = join('.',$c->{prefix},'clean','marc','xml') unless $c->{output}; @@ -634,6 +692,12 @@ sub initialize { if ( $c->{'original-tag'} and $c->{trash}->has($c->{'original-tag'}) ); } +sub abort { + my $msg = shift; + print STDERR "$0: $msg", "\n"; + exit 1; +} + sub show_help { print < @@ -655,13 +719,17 @@ Options and renumbering is in effect, an old-to-new mapping file (old2new.map) will be generated. - --autoscrub -a Automatically remove non-numeric tags in data - --nocollapse -n Don't compress records to one line on output - --no-strip9 Don't autoremove 901/903 tags in data - --trashfile -t File containing trash tag data (see --trashhelp) - - --fullauto No manual edits. All problematic records dumped to - exception file. + --force-utf8 Rewrite each record so that they describe themselves as + UTF-8 encoded + --autoscrub -a Automatically remove non-numeric tags in data + --fix-subfield -fs Subfield code to use in place of non-alphanumeric + or empty subfield codes + --nocollapse -n Don't compress records to one line on output + --no-strip9 Don't autoremove 901/903 tags in data + --trashfile -t File containing trash tag data (see --trashhelp) + + --fullauto No manual edits. All problematic records dumped to + exception file. HELP exit;