use strict;
use warnings;
-my $skip = shift || 0;
+use Getopt::Long;
+use Term::ReadLine;
+
+# Line-editing prompt used by the interactive cleanup session.
+# Direct method call instead of indirect object syntax ("new Class ARGS"),
+# which Perl can misparse depending on what names are in scope.
+my $term = Term::ReadLine->new('yaz-cleanup');
+# Print prompts and messages to the handle the terminal object provides,
+# falling back to STDOUT when it provides none.
+my $OUT = $term->OUT || \*STDOUT;
+
my $count = 0;
-$| = 1;
+my $line = '';  # NOTE(review): not referenced by the code visible here (edit declares its own $line) - confirm it is still needed
+
+my @record = ();  # lines of the record being read; getline restarts it whenever a line matches <record>
+my @context= ();  # the most recent lines read (at most five), maintained by getline and shown by print_context
+
+my %commands = ( '?' => \&help,  # prompt commands looked up by edit; '?' and 'h' both show the help text
+ h => \&help,
+ c => \&print_context,
+ d => \&dump_record,  # NOTE(review): dump_record is not defined in the code visible here - confirm it exists
+ q => \&quit,
+ );
+
-open MARC, '<', 'incoming.marc.xml';
-open NUMARC, '>', 'incoming.clean.marc.xml';
+# Stop immediately when either file cannot be opened; an unchecked failure
+# would otherwise surface only as read/write warnings and an empty result.
+# The bareword handles are kept because the rest of the script uses them.
+open MARC, '<', 'incoming.marc.xml'
+    or die "Can't open incoming.marc.xml for reading: $!\n";
+open NUMARC, '>', 'incoming.clean.marc.xml'
+    or die "Can't open incoming.clean.marc.xml for writing: $!\n";
-if ($skip) {
- until ($count == ($skip - 1)) {
- my $t = <MARC>;
- print NUMARC $t;
- $count++;
- printf("\rSpinning on to record %s (%2.2f%%)", $skip, ($count / $skip *100))
- unless ($count % 1000);
- }
- print "\nScrubbing resumes...\n" if $skip;
-}
-
-my $line1 = <MARC>;
+my $line1 = getline();  # main scrub pass: prime the one-line look-ahead; getline also updates $count, @record and @context
-while (my $line2 = <MARC>) {
- $count++;
+while (my $line2 = getline()) {  # $line1 is the line under inspection, $line2 the line that follows it
# catch empty datafield elements
if ($line1 =~ m/<datafield tag="..." ind1="." ind2=".">/) {
if ($line2 =~ m|</datafield>|) {
print "Empty datafield scrubbed at line $count\n";
- $line1 = <MARC>;
- $count++;
+ $line1 = getline();  # drop both lines of the empty element and resume with the line after it
next;
}
}
$line1 =~ s/tag="(\d\d) /tag="0$1/g;
# naked ampersands
- die "Looks like naked ampersand at line $count: $line1"
+ edit("Looks like naked ampersand", $line1)  # prompt the user instead of dying; edit's return value is not used here
if ($line1 =~ /&/ && $line1 !~ /&\w{1,7};/);
# subfields can't be non-alphanumeric
$line1 = $line2;
}
print NUMARC $line1;
+
+# Interactive prompt shown when a suspect line is found.
+#
+# Arguments:
+#   $msg     - short description of the problem
+#   $line_in - the offending line, echoed back to the user
+#
+# Prints the message, the current value of $count and the line to $OUT, then
+# reads input from the terminal and dispatches it through %commands.  The
+# loop has no exit of its own: it ends only when a handler ends the program
+# (as 'q' does) or when the terminal reaches end-of-input.
+#
+# NOTE(review): the help text offers "a replacement for this line" and "a
+# blank line to dump this line"; neither is handled here, so both are
+# reported as unknown input.
+sub edit {
+    my ($msg, $line_in) = @_;
+    print $OUT "\n".$msg, " at line $count:\n";
+    print $OUT "\t$line_in\n";
+    while (1) {
+        my $cmd = $term->readline('yaz-cleanup>');
+        # readline yields undef at end-of-input; stop with a clear message
+        # rather than an uninitialized-value warning and a code-ref crash.
+        die "Input closed while editing line $count\n" unless defined $cmd;
+
+        my $handler = $commands{$cmd};
+        # Input that is not a known command (a blank line included) used to
+        # die with "Can't use an undefined value as a subroutine reference".
+        unless ($handler) {
+            print $OUT "Unknown command '$cmd'; enter ? for help\n";
+            next;
+        }
+        # A \&name reference exists even when the sub was never defined;
+        # check before calling so a missing handler does not kill the session.
+        unless (defined &{$handler}) {
+            print $OUT "Command '$cmd' is not available\n";
+            next;
+        }
+        $handler->();
+    }
+}
+
+# Show the lines currently held in @context on $OUT, marking the line under
+# inspection with '==>'.
+#
+# The marked line is the second-newest entry, because the main loop has
+# already read one line ahead of the line it is checking.  Deriving the
+# position from the window size (instead of fixed indexes 0..4) keeps the
+# marker on the right line, and avoids uninitialized-value warnings, when
+# fewer than five lines have been read.  Prints nothing when @context is
+# empty.
+sub print_context {
+    return unless @context;
+    my $current = @context > 1 ? $#context - 1 : 0;
+    print {$OUT} "\n";
+    for my $i (0 .. $#context) {
+        my $marker = $i == $current ? '==>' : ' ';
+        print {$OUT} $marker, $context[$i];
+    }
+    print {$OUT} "\n";
+}
+
+# Read the next line from MARC and return it (undef at end of file).
+#
+# Every call increments $count, including the call that hits end of file.
+# For each line actually read, @record is restarted when the line matches
+# <record> and extended otherwise, and the line is appended to @context,
+# which is trimmed so it never holds more than five entries.
+sub getline {
+    my $next = <MARC>;
+    $count++;
+    return $next unless defined $next;
+
+    # a <record> line begins a fresh record buffer
+    @record = () if $next =~ /<record>/;
+    push @record, $next;
+
+    push @context, $next;
+    if (scalar(@context) > 5) {
+        shift @context;
+    }
+    return $next;
+}
+
+# Print the command summary for the interactive prompt to $OUT.
+sub help {
+    my $usage = <<'HELP';
+
+Enter a replacement for this line, a blank line to dump this line, or a command.
+Commands: c Show line context
+ d Dump this record (redirect to exceptions file)
+ q Quit
+
+HELP
+    print {$OUT} $usage;
+}
+
+# Handler for the 'q' command: end the program.
+sub quit {
+    exit;
+}