From b9e4ad4dde4619596902dbdb17d355ebc0a850dc Mon Sep 17 00:00:00 2001 From: Shawn Boyette Date: Fri, 19 Sep 2008 20:05:57 +0000 Subject: [PATCH] naked amp detect --- yaz-cleanup | 27 ++++++++++++++++++++------- 1 files changed, 20 insertions(+), 7 deletions(-) diff --git a/yaz-cleanup b/yaz-cleanup index b3d5eda..1f9d93d 100755 --- a/yaz-cleanup +++ b/yaz-cleanup @@ -10,14 +10,16 @@ $| = 1; open MARC, '<', 'incoming.marc.xml'; open NUMARC, '>', 'incoming.clean.marc.xml'; -until ($count == ($skip - 1)) { - my $t = ; - print NUMARC $t; - $count++; - printf("\rSpinning on to record %s (%2.2f%%)", $skip, ($count / $skip *100)) - unless ($count % 1000); +if ($skip) { + until ($count == ($skip - 1)) { + my $t = ; + print NUMARC $t; + $count++; + printf("\rSpinning on to record %s (%2.2f%%)", $skip, ($count / $skip *100)) + unless ($count % 1000); + } + print "\nScrubbing resumes...\n" if $skip; } -print "\nScrubbing resumes...\n" if $skip; my $line1 = ; @@ -26,18 +28,29 @@ while (my $line2 = ) { # catch empty datafield elements if ($line1 =~ m//) { if ($line2 =~ m||) { + print "Empty datafield scrubbed at line $count\n"; $line1 = ; $count++; next; } } + # clean misplaced dollarsigns + if ($line1 =~ m|c?\d+\.\d{2}|) { + $line1 =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|; + print "Dollar sign in subfield code corrected at line $count\n"; + } + # clean up tags with spaces in them $line1 =~ s/tag=" /tag="00/g; $line1 =~ s/tag=" /tag="0/g; $line1 =~ s/tag="-/tag="0/g; $line1 =~ s/tag="(\d\d) /tag="0$1/g; + # naked ampersands + die "Looks like naked ampersand at line $count: $line1" + if ($line1 =~ /&/ && $line1 !~ /&\w{1,7};/); + # subfields can't be non-alphanumeric die "Junk in subfield at line $count: $line1" if $line1 =~ /