#!/usr/bin/perl
+
+# Copyright 2009-2012, Equinox Software, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
require 5.10.0;
use strict;
until ($ptr == $#record) {
# get datafield/tag data if we have it
$rc = stow_record_data() if ($c->{'renumber-from'} and $c->{'original-tag'});
- return $rc if $rc;
+ next if $rc;
# naked ampersands
if ($record[$ptr] =~ /&/ && $record[$ptr] !~ /&\w+?;/)
# subfields can't be non-alphanumeric
if ($record[$ptr] =~ /<subfield code="(.*?)"/) {
if ($1 =~ /\P{IsAlnum}/ or $1 eq '') {
- edit("Junk in subfield code/Null subfield code");
+ edit("Junk in subfield code/Null subfield code ($1)");
next;
}
}
- # subfields can't be non-alphanumeric
+ # subfields can't be larger than 1 char (technically you could make the MARC format accommodate that:)
if ($record[$ptr] =~ /<subfield code="(\w{2,})"/) {
edit("Subfield code larger than 1 char");
next;
message("Short leader padded");
}
}
+ if ($c->{'force-utf8'}) {
+ if ($record[$ptr] =~ m|<leader>(.........).(.+)</leader>|) {
+ $record[$ptr] = "<leader>$1a$2</leader>\n";
+ }
+ }
if ($record[$ptr] =~ m|<controlfield tag="008">(.+?)</control|) {
#pad short 008
my $content = $1;
message("Dollar sign corrected");
}
+ # excessive trailing whitespace in subfield contents
+ if ($record[$ptr] =~ m|\s{10,}</subfield>|) {
+ $record[$ptr] =~ s|\s{10,}</subfield>|</subfield>|;
+ message("Trailing whitespace trimmed from subfield contents");
+ }
+
# automatable subfield maladies
- $record[$ptr] =~ s/code=" ">c/code="c">/;
- $record[$ptr] =~ s/code=" ">\$/code="c">\$/;
+ if ($record[$ptr] =~ /code=" ">c/) {
+ message('Fixing probable subfield c, scenario 1');
+ $record[$ptr] =~ s/code=" ">c/code="c">/;
+ }
+ if ($record[$ptr] =~ /code=" ">\$/) {
+ message('Fixing probable subfield c, scenario 2');
+ $record[$ptr] =~ s/code=" ">\$/code="c">\$/;
+ }
+
+ if ($c->{'fix-subfield'}) {
+ if ($record[$ptr] =~ /code="&">/) {
+ message('Fixing & for subfield code');
+ $record[$ptr] =~ s/code="&">/code="$c->{'fix-subfield'}">/;
+ }
+ if ($record[$ptr] =~ /code="(.*?\P{IsAlnum}.*?)">/) {
+ message("Fixing non-alphanumeric subfield code: $1 -> " . $c->{'fix-subfield'});
+ $record[$ptr] =~ s/code=".*?\P{IsAlnum}.*?">/code="$c->{'fix-subfield'}">/;
+ }
+ if ($record[$ptr] =~ /code="">/) {
+ message('Fixing null subfield code');
+ $record[$ptr] =~ s/code="">/code="$c->{'fix-subfield'}">/;
+ }
+ }
}
return 0;
}
sub stow_record_data {
# get tag data if we're looking at it
my $tag = 0;
- if ($record[$ptr] =~ m/<(control|data)field tag="(.{3})"/) {
+ if ($record[$ptr] =~ m/<(?:control|data)field tag="(.{3})"/) {
$recmeta{tag} = $1;
$tag = $recmeta{tag};
$record[$ptr] =~ m/ind1="(.)"/;
my $istrash = 0;
my $trash = $c->{trash};
- $l = <MARC> while (defined $l and $l !~ /<record>/);
+ $l = <MARC> while (defined $l and $l !~ /<record/);
return $l unless defined $l;
$c->{ricount}++;
'renumber-subfield|rs=s',
'original-tag|ot=i',
'original-subfield|os=s',
+ 'fix-subfield|fs=s',
+ 'force-utf8',
'script',
'no-strip9',
'trashfile|t=s',
show_trashhelp() if ($c->{trashhelp});
# defaults
- my $pfx = $c->{prefix} // "bibs";
+ my $pfx = defined($c->{prefix}) ? $c->{prefix} : "bibs";
$c->{ricount} = 0;
$c->{rocount} = 0;
$c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'};
$c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'};
$c->{window} = 9;
+ if ($c->{marcfile} and $c->{prefix}) { abort('You can not declare a marc file and prefix.'); }
+ if ($c->{marcfile}) {
+ $c->{output} = join('.',$c->{marcfile},'clean')
+ unless $c->{output};
+ $c->{exception} = join('.',$c->{marcfile},'exception')
+ unless $c->{exception};
+ }
if ($c->{prefix}) {
$c->{output} = join('.',$c->{prefix},'clean','marc','xml')
unless $c->{output};
if ( $c->{'original-tag'} and $c->{trash}->has($c->{'original-tag'}) );
}
+sub abort { # report a fatal error and stop; takes one argument: the message text
+ my $msg = shift;
+ print STDERR "$0: $msg", "\n"; # message goes to STDERR, prefixed with the program name ($0) and followed by a newline
+ exit 1; # end the program with exit status 1
+}
+
sub show_help {
print <<HELP;
Usage is: marc_cleanup [OPTIONS] <filelist>
--output -o Cleaned MARCXML output filename
--exception -x Exception (dumped records) MARCXML filename
or
- --prefix=<PREFIX>> -p Shared prefix for output/exception files. Will produce
+ --prefix=<PREFIX> -p Shared prefix for output/exception files. Will produce
PREFIX.clean.marc.xml and PREFIX.exception.marc.xml
--marcfile -m Input filename. Defaults to PREFIX.marc.xml
and renumbering is in effect, an old-to-new mapping
file (old2new.map) will be generated.
- --autoscrub -a Automatically remove non-numeric tags in data
- --nocollapse -n Don't compress records to one line on output
- --no-strip9 Don't autoremove 901/903 tags in data
- --trashfile -t File containing trash tag data (see --trashhelp)
-
- --fullauto No manual edits. All problematic records dumped to
- exception file.
+ --force-utf8 Rewrite each record so that they describe themselves as
+ UTF-8 encoded
+ --autoscrub -a Automatically remove non-numeric tags in data
+ --fix-subfield -fs Subfield code to use in place of non-alphanumeric
+ or empty subfield codes
+ --nocollapse -n Don't compress records to one line on output
+ --no-strip9 Don't autoremove 901/903 tags in data
+ --trashfile -t File containing trash tag data (see --trashhelp)
+
+ --fullauto No manual edits. All problematic records dumped to
+ exception file.
HELP
exit;