X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=marc_cleanup;h=58a68c163e0786ee2ef20ce154ce2de3f3b1e398;hp=f978ef3d42fa808f71c229c7f0a0db184a5c4eb5;hb=13e86d342c9b320afbbcc2c1007bcded07468599;hpb=a5e1b131e101f84386fade9385dfc4f731b59ae1
diff --git a/marc_cleanup b/marc_cleanup
index f978ef3..58a68c1 100755
--- a/marc_cleanup
+++ b/marc_cleanup
@@ -1,4 +1,21 @@
#!/usr/bin/perl
+
+# Copyright 2009-2012, Equinox Software, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
require 5.10.0;
use strict;
@@ -76,7 +93,7 @@ while ( buildrecord() ) {
until ($ptr == $#record) {
# get datafield/tag data if we have it
$rc = stow_record_data() if ($c->{'renumber-from'} and $c->{'original-tag'});
- return $rc if $rc;
+ next if $rc;
# naked ampersands
if ($record[$ptr] =~ /&/ && $record[$ptr] !~ /&\w+?;/)
@@ -94,11 +111,11 @@ while ( buildrecord() ) {
# subfields can't be non-alphanumeric
if ($record[$ptr] =~ /{'force-utf8'}) {
+ if ($record[$ptr] =~ m|(.........).(.+)|) {
+ $record[$ptr] = "$1a$2\n";
+ }
+ }
if ($record[$ptr] =~ m|(.+?)|) {
+ $record[$ptr] =~ s|\s{10,}||;
+ message("Trailing whitespace trimmed from subfield contents");
+ }
+
# automatable subfield maladies
- $record[$ptr] =~ s/code=" ">c/code="c">/;
- $record[$ptr] =~ s/code=" ">\$/code="c">\$/;
+ if ($record[$ptr] =~ /code=" ">c/) {
+ message('Fixing probable subfield c, scenario 1');
+ $record[$ptr] =~ s/code=" ">c/code="c">/;
+ }
+ if ($record[$ptr] =~ /code=" ">\$/) {
+ message('Fixing probable subfield c, scenario 2');
+ $record[$ptr] =~ s/code=" ">\$/code="c">\$/;
+ }
+
+ if ($c->{'fix-subfield'}) {
+ if ($record[$ptr] =~ /code="&">/) {
+ message('Fixing & for subfield code');
+ $record[$ptr] =~ s/code="&">/code="$c->{'fix-subfield'}">/;
+ }
+ if ($record[$ptr] =~ /code="(.*?\P{IsAlnum}.*?)">/) {
+ message("Fixing non-alphanumeric subfield code: $1 -> " . $c->{'fix-subfield'});
+ $record[$ptr] =~ s/code=".*?\P{IsAlnum}.*?">/code="$c->{'fix-subfield'}">/;
+ }
+ if ($record[$ptr] =~ /code="">/) {
+ message('Fixing null subfield code');
+ $record[$ptr] =~ s/code="">/code="$c->{'fix-subfield'}">/;
+ }
+ }
}
return 0;
}
@@ -299,7 +348,7 @@ sub buildrecord {
my $istrash = 0;
my $trash = $c->{trash};
- $l = while (defined $l and $l !~ //);
+ $l = while (defined $l and $l !~ /{ricount}++;
@@ -592,6 +641,8 @@ sub initialize {
'renumber-subfield|rs=s',
'original-tag|ot=i',
'original-subfield|os=s',
+ 'fix-subfield|fs=s',
+ 'force-utf8',
'script',
'no-strip9',
'trashfile|t=s',
@@ -655,13 +706,17 @@ Options
and renumbering is in effect, an old-to-new mapping
file (old2new.map) will be generated.
- --autoscrub -a Automatically remove non-numeric tags in data
- --nocollapse -n Don't compress records to one line on output
- --no-strip9 Don't autoremove 901/903 tags in data
- --trashfile -t File containing trash tag data (see --trashhelp)
-
- --fullauto No manual edits. All problematic records dumped to
- exception file.
+ --force-utf8 Rewrite each record so that it describes itself as
+ UTF-8 encoded
+ --autoscrub -a Automatically remove non-numeric tags in data
+ --fix-subfield -fs Subfield code to use in place of non-alphanumeric
+ or empty subfield codes
+ --nocollapse -n Don't compress records to one line on output
+ --no-strip9 Don't autoremove 901/903 tags in data
+ --trashfile -t File containing trash tag data (see --trashhelp)
+
+ --fullauto No manual edits. All problematic records dumped to
+ exception file.
HELP
exit;