10 open MARC, '<', 'incoming.marc.xml' or die "Cannot open incoming.marc.xml: $!"; # input handle: the MARCXML file to be scrubbed; abort rather than run on an unopened handle
11 open NUMARC, '>', 'incoming.clean.marc.xml' or die "Cannot open incoming.clean.marc.xml: $!"; # output handle (truncates/creates the file); NOTE(review): presumably receives the cleaned lines -- the writes are not visible here
14 until ($count == ($skip - 1)) { # keep looping until $count equals $skip - 1; NOTE(review): the rest of the loop body is not visible here -- presumably it reads input and advances $count, confirm
18 printf("\rSpinning on to record %s (%2.2f%%)", $skip, ($count / $skip *100)) # progress message redrawn in place via "\r": the target $skip and $count as a percentage of it
19 unless ($count % 1000); # only emitted when $count is a multiple of 1000
21 print "\nScrubbing resumes...\n" if $skip; # announced only when $skip is true (non-zero)
26 while (my $line2 = <MARC>) { # read the next line from the MARC handle into $line2 until end of file
28 # catch empty datafield elements: an opening <datafield> tag in $line1 with its closing tag in $line2
29 if ($line1 =~ m/<datafield tag="..." ind1="." ind2=".">/) { # $line1 holds an opening datafield tag (any three-character tag, any one-character indicators)
30 if ($line2 =~ m|</datafield>|) { # ...and $line2 contains the closing tag; NOTE(review): presumably $line1 is the line preceding $line2 -- confirm
31 print "Empty datafield scrubbed at line $count\n"; # report it; NOTE(review): the code that actually drops the element is not visible in this excerpt
38 # clean misplaced dollarsigns: an amount whose "$" ended up as the subfield code
39 if ($line1 =~ m|<subfield code="\$">c?\d+\.\d{2}|) { # subfield code is a literal "$" and the content starts with an optional "c", digits, ".", two digits
40 $line1 =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|; # set the code to "c" and put the "$" in front of the amount, dropping any leading "c" from the content
41 print "Dollar sign in subfield code corrected at line $count\n"; # report the correction
44 # clean up tags with spaces in them by replacing the blanks with zeros
45 $line1 =~ s/tag="  /tag="00/g; # two leading spaces -> "00"; must run before the single-space rule, which would otherwise consume the first space
46 $line1 =~ s/tag=" /tag="0/g; # one leading space -> "0"
47 $line1 =~ s/tag="-/tag="0/g; # leading hyphen -> "0"
48 $line1 =~ s/tag="(\d\d) /tag="0$1/g; # two digits followed by a space -> the same digits zero-padded on the left
51 die "Looks like naked ampersand at line $count: $line1"
52 if $line1 =~ /&(?!(?:#\d+|#x[[:xdigit:]]+|\w{1,7});)/; # abort on any "&" that does not begin an entity (&name;) or character reference (&#38; / &#x26;), even when the same line also contains valid ones
54 # subfields can't be non-alphanumeric: abort on a subfield code that is not a letter or digit
55 die "Junk in subfield at line $count: $line1"
56 if $line1 =~ /<subfield code="[^[:alnum:]]"/; # matches only one-character codes; empty or longer codes are not caught by this pattern