#!/usr/bin/perl use strict; use warnings; my $skip = shift || 0; my $count = 0; $| = 1; open MARC, '<', 'incoming.marc.xml'; open NUMARC, '>', 'incoming.clean.marc.xml'; if ($skip) { until ($count == ($skip - 1)) { my $t = ; print NUMARC $t; $count++; printf("\rSpinning on to record %s (%2.2f%%)", $skip, ($count / $skip *100)) unless ($count % 1000); } print "\nScrubbing resumes...\n" if $skip; } my $line1 = ; while (my $line2 = ) { $count++; # catch empty datafield elements if ($line1 =~ m//) { if ($line2 =~ m||) { print "Empty datafield scrubbed at line $count\n"; $line1 = ; $count++; next; } } # clean misplaced dollarsigns if ($line1 =~ m|c?\d+\.\d{2}|) { $line1 =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|; print "Dollar sign in subfield code corrected at line $count\n"; } # clean up tags with spaces in them $line1 =~ s/tag=" /tag="00/g; $line1 =~ s/tag=" /tag="0/g; $line1 =~ s/tag="-/tag="0/g; $line1 =~ s/tag="(\d\d) /tag="0$1/g; # naked ampersands die "Looks like naked ampersand at line $count: $line1" if ($line1 =~ /&/ && $line1 !~ /&\w{1,7};/); # subfields can't be non-alphanumeric die "Junk in subfield at line $count: $line1" if $line1 =~ /