#!/usr/bin/perl use strict; use warnings; use Getopt::Long; use Term::ReadLine; $| = 1; my $term = new Term::ReadLine 'yaz-cleanup'; my $OUT = $term->OUT || \*STDOUT; my $count = 0; my $reccount = 0; my $line = ''; my @record = (); # current record storage my @context= (); # last 5 lines of file open MARC, '<', 'incoming.marc.xml'; open my $NUMARC, '>', 'incoming.clean.marc.xml'; print $NUMARC '',"\n"; open my $EXMARC, '>', 'incoming.exceptions.marc.xml'; print $EXMARC '',"\n"; open MARC2, '<', 'incoming.marc.xml'; ; # this is the dispatch table which drives command selection in # edit(), below my %commands = ( c => \&print_context, k => \&kill_line, o => \&show_original, t => \&commit_edit, x => \&dump_record, q => \&quit, '?' => \&help, h => \&help, help => \&help, ); my @spinner = qw(- / | \\); my $sidx = 0; while (my $line = getline()) { unless ($count % 2000) { print "\rWorking... ", $spinner[$sidx]; $sidx++; $sidx = 0 if ($sidx > $#spinner); } update_context(); # catch empty datafield elements if ($line =~ m||) { if ($record[-2] =~ m//) { pop @record; pop @record; print $OUT "\rEmpty datafield scrubbed at line $count\n"; next; } } # clean misplaced dollarsigns if ($line =~ m|c?\d+\.\d{2}|) { $line =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|; print $OUT "\rDollar sign in subfield code corrected at line $count\n"; } # clean up tags with spaces in them $line =~ s/tag=" /tag="00/g; $line =~ s/tag=" /tag="0/g; $line =~ s/tag="-/tag="0/g; $line =~ s/tag="(\d\d) /tag="0$1/g; # naked ampersands if ($line =~ /&/ && $line !~ /&\w{1,7};/) { edit("Looks like naked ampersand", $line); next } # subfields can't be non-alphanumeric if ($line =~ /\n"; print $EXMARC "\n"; =head2 edit Handles the Term::ReadLine loop =cut sub edit { my ($msg, $line_in) = @_; print $OUT "\r".$msg, " at line $count:\n"; print_context(); while (1) { my $line = $term->readline('yaz-cleanup>'); if (defined $commands{$line}) { my $term = $commands{$line}->($line_in); last if $term; } else { if ($context[3] eq " [LINE KILLED]\n") { push @record, "$line\n" } else { $record[-1] = "$line\n"; } $context[3] = "$line\n"; print_context(); } } } =head2 getline Reads from the incoming MARC file; returns lines into the driver loop. Batches records for output, and maintains the context listing. =cut sub getline { my $l = ; $count++; if (defined $l) { if ($l =~ //) { @record = ($l); $reccount++; } elsif ($l =~ m||) { write_record($NUMARC) if $reccount; } else { push @record, $l; } } return $l; } sub write_record { my ($FH) = @_; print $FH '',"\n"; print $FH @record; print $FH "\n"; } sub update_context { my $line2 = ; push @context, $line2; shift @context if (@context > 5); } #----------------------------------------------------------------------------------- # command routines #----------------------------------------------------------------------------------- sub print_context { print $OUT "\n", join(' |','',@context[0..2]); print $OUT '==> |', $context[3]; print $OUT ' |', $context[4],"\n"; return 0; } sub show_original { my ($line_in) = @_; print $OUT "\n$line_in\n"; return 0; } sub commit_edit { return 1 } sub kill_line { pop @record; $context[3] = " [LINE KILLED]\n"; print_context(); return 0; } sub dump_record { my $line = ; $count++; update_context(); until ($line =~ m||) { push @record, $line; $line = ; $count++; update_context(); } push @record, $line; write_record($EXMARC); } sub help { print $OUT <