#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use Term::ReadLine;
my $term = new Term::ReadLine 'yaz-cleanup';
my $OUT = $term->OUT || \*STDOUT;
my $count = 0;
my $line = '';
my @record = ();
my @context= ();
my %commands = ( '?' => \&help,
h => \&help,
c => \&print_context,
d => \&dump_record,
q => \&quit,
);
open MARC, '<', 'incoming.marc.xml';
open NUMARC, '>', 'incoming.clean.marc.xml';
my $line1 = getline();
while (my $line2 = getline()) {
# catch empty datafield elements
if ($line1 =~ m//) {
if ($line2 =~ m||) {
print "Empty datafield scrubbed at line $count\n";
$line1 = getline();
next;
}
}
# clean misplaced dollarsigns
if ($line1 =~ m|c?\d+\.\d{2}|) {
$line1 =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|;
print "Dollar sign in subfield code corrected at line $count\n";
}
# clean up tags with spaces in them
$line1 =~ s/tag=" /tag="00/g;
$line1 =~ s/tag=" /tag="0/g;
$line1 =~ s/tag="-/tag="0/g;
$line1 =~ s/tag="(\d\d) /tag="0$1/g;
# naked ampersands
edit("Looks like naked ampersand", $line1)
if ($line1 =~ /&/ && $line1 !~ /&\w{1,7};/);
# subfields can't be non-alphanumeric
die "Junk in subfield at line $count: $line1"
if $line1 =~ /readline('yaz-cleanup>');
$commands{$line}->();
}
}
sub print_context {
print $OUT "\n", join(' ','',@context[0..2]);
print $OUT '==>', $context[3];
print $OUT ' ', $context[4],"\n";
}
sub getline {
my $l = ;
$count++;
if (defined $l) {
if ($l =~ //)
{ @record = ($l) }
else
{ push @record, $l }
push @context, $l;
shift @context if (@context > 5);
}
return $l;
}
sub help {
print $OUT <