# Open the raw MARCXML input for reading and the cleaned copy for writing.
# NOTE(review): neither open is checked for failure in this excerpt.
open MARC, '<', 'incoming.marc.xml';
open NUMARC, '>', 'incoming.clean.marc.xml';
# Removed version of the fast-forward loop: it was not wrapped in an
# `if ($skip)` guard, so the loop condition was evaluated even with no $skip.
-until ($count == ($skip - 1)) {
- my $t = <MARC>;
- print NUMARC $t;
- $count++;
- printf("\rSpinning on to record %s (%2.2f%%)", $skip, ($count / $skip *100))
- unless ($count % 1000);
# Added version: only fast-forward when $skip is true.
+if ($skip) {
  # Copy input lines to the output unchanged, one per iteration, counting
  # each in $count, until $count equals $skip - 1.
  # NOTE(review): the exit test is an equality check, so it relies on $count
  # starting at or below $skip - 1; nothing here stops the loop at end of file.
+ until ($count == ($skip - 1)) {
+ my $t = <MARC>;
+ print NUMARC $t;
+ $count++;
  # Progress report (with percentage of $skip reached) on every 1000th line;
  # the leading carriage return keeps the message on one console line.
+ printf("\rSpinning on to record %s (%2.2f%%)", $skip, ($count / $skip *100))
+ unless ($count % 1000);
+ }
  # NOTE(review): the trailing `if $skip` is redundant inside `if ($skip) { ... }`.
+ print "\nScrubbing resumes...\n" if $skip;
}
-print "\nScrubbing resumes...\n" if $skip;
# Read the next input line into $line1 and run the scrubbing checks on it.
# NOTE(review): the `next` below implies this code sits inside a loop whose
# header is not visible in this excerpt.
my $line1 = <MARC>;
# Catch empty datafield elements: an opening <datafield ...> tag in $line1
# with a closing </datafield> in $line2.
# NOTE(review): $line2 is not assigned anywhere in this excerpt; presumably it
# is a look-ahead line read elsewhere. Confirm against the full file.
if ($line1 =~ m/<datafield tag="..." ind1="." ind2=".">/) {
if ($line2 =~ m|</datafield>|) {
+ print "Empty datafield scrubbed at line $count\n";
# Replace $line1 with a fresh line, bump the counter, and skip the remaining
# checks for this iteration.
$line1 = <MARC>;
$count++;
next;
}
}
+ # Clean misplaced dollar signs: a subfield whose code is "$" and whose text
+ # is an optional "c" plus an amount with two decimals becomes code "c" with
+ # the "$" moved in front of the amount, e.g.
+ # <subfield code="$">c12.95  ->  <subfield code="c">$12.95
+ if ($line1 =~ m|<subfield code="\$">c?\d+\.\d{2}|) {
+ $line1 =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|;
+ print "Dollar sign in subfield code corrected at line $count\n";
+ }
+
# Clean up tags with spaces (or a leading hyphen) in them by substituting
# zeros so the tag attribute is digits only.
# NOTE(review): the first two substitutions show the same pattern as written
# here, so the second can never match after the first has run; whitespace in
# the first pattern may have been collapsed. Confirm against the original file.
$line1 =~ s/tag=" /tag="00/g;
$line1 =~ s/tag=" /tag="0/g;
# A leading hyphen is replaced by a zero.
$line1 =~ s/tag="-/tag="0/g;
# Two digits followed by a space: drop the space and prepend a zero
# (tag="12 " becomes tag="012").
$line1 =~ s/tag="(\d\d) /tag="0$1/g;
+ # Naked ampersands: abort when the line contains "&" but no entity-like
+ # sequence (& + 1-7 word characters + ;) anywhere on the line.
+ # NOTE(review): a line holding both a valid entity and a naked ampersand
+ # passes this check, and a line whose only ampersands are numeric references
+ # such as &#38; dies, because "#" is not matched by \w.
+ die "Looks like naked ampersand at line $count: $line1"
+ if ($line1 =~ /&/ && $line1 !~ /&\w{1,7};/);
+
# Subfield codes must be alphanumeric: abort when a subfield code attribute is
# a single character outside the POSIX [:alnum:] class.
die "Junk in subfield at line $count: $line1"
if $line1 =~ /<subfield code="[^[:alnum:]]"/;