#!/usr/bin/perl use strict; use warnings; use Getopt::Long; use Term::ReadLine; my $term = new Term::ReadLine 'yaz-cleanup'; my $OUT = $term->OUT || \*STDOUT; my $count = 0; my $line = ''; my @record = (); my @context= (); my %commands = ( '?' => \&help, h => \&help, c => \&print_context, d => \&dump_record, q => \&quit, ); open MARC, '<', 'incoming.marc.xml'; open NUMARC, '>', 'incoming.clean.marc.xml'; my $line1 = getline(); while (my $line2 = getline()) { # catch empty datafield elements if ($line1 =~ m//) { if ($line2 =~ m||) { print "Empty datafield scrubbed at line $count\n"; $line1 = getline(); next; } } # clean misplaced dollarsigns if ($line1 =~ m|c?\d+\.\d{2}|) { $line1 =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|; print "Dollar sign in subfield code corrected at line $count\n"; } # clean up tags with spaces in them $line1 =~ s/tag=" /tag="00/g; $line1 =~ s/tag=" /tag="0/g; $line1 =~ s/tag="-/tag="0/g; $line1 =~ s/tag="(\d\d) /tag="0$1/g; # naked ampersands edit("Looks like naked ampersand", $line1) if ($line1 =~ /&/ && $line1 !~ /&\w{1,7};/); # subfields can't be non-alphanumeric die "Junk in subfield at line $count: $line1" if $line1 =~ /readline('yaz-cleanup>'); $commands{$line}->(); } } sub print_context { print $OUT "\n", join(' ','',@context[0..2]); print $OUT '==>', $context[3]; print $OUT ' ', $context[4],"\n"; } sub getline { my $l = ; $count++; if (defined $l) { if ($l =~ //) { @record = ($l) } else { push @record, $l } push @context, $l; shift @context if (@context > 5); } return $l; } sub help { print $OUT <