#!/usr/bin/perl use strict; use warnings; use Getopt::Long; use Term::ReadLine; $| = 1; my $term = new Term::ReadLine 'yaz-cleanup'; my $OUT = $term->OUT || \*STDOUT; my $count = 0; my $reccount = 0; my $line = ''; my @record = (); # current record storage my %reccontext = (); my @linecontext= (); # last 5 lines of file my $input = shift || 'incoming.marc.xml'; open MARC, '<', $input; open my $NUMARC, '>', 'incoming.clean.marc.xml'; print $NUMARC '',"\n"; open my $EXMARC, '>', 'incoming.exceptions.marc.xml'; print $EXMARC '',"\n"; open MARC2, '<', $input; ; # this is the dispatch table which drives command selection in # edit(), below my %commands = ( c => \&print_context, C => \&print_linecontext, k => \&kill_line, o => \&show_original, m => \&merge_lines, t => \&commit_edit, x => \&dump_record, q => \&quit, '?' => \&help, h => \&help, help => \&help, ); my @spinner = qw(- / | \\); my $sidx = 0; while (my $line = getline()) { unless ($count % 2000) { print "\rWorking... ", $spinner[$sidx]; $sidx = ($sidx == $#spinner) ? 0 : $sidx + 1; } update_linecontext(); # catch empty datafield elements if ($line =~ m||) { if ($record[-2] =~ m//) { pop @record; pop @record; print $OUT "\rEmpty datafield scrubbed at line $count\n"; next; } } # clean misplaced dollarsigns if ($line =~ m|c?\d+\.\d{2}|) { $line =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|; print $OUT "\rDollar sign in subfield code corrected at line $count\n"; } # clean up tags with spaces in them $line =~ s/tag=" /tag="00/g; $line =~ s/tag=" /tag="0/g; $line =~ s/tag="-/tag="0/g; $line =~ s/tag="(\d\d) /tag="0$1/g; # stow tag data if we're looking at it if ($line =~ m//) { $reccontext{tag} = $1; $reccontext{ind1} = $2; $reccontext{ind2} = $3; } # naked ampersands if ($line =~ /&/ && $line !~ /&\w+?;/) { edit("Naked ampersand", $line); next } # subfields can't be non-alphanumeric if ($line =~ /\n"; print $EXMARC "\n"; =head2 edit Handles the Term::ReadLine loop =cut sub edit { my ($msg, $line_in) = @_; print $OUT "\r".$msg, " at line $count:\n"; print_context(); while (1) { my $line = $term->readline('marc-cleanup>'); if (length $line < 2) { next unless (defined $commands{$line}) } if (defined $commands{$line}) { my $term = $commands{$line}->($line_in); last if $term; } else { if ($linecontext[3] eq " [LINE KILLED]\n") { push @record, "$line\n" } else { $record[-1] = "$line\n"; } $linecontext[3] = "$line\n"; print_linecontext(); } } } =head2 getline Reads from the incoming MARC file; returns lines into the driver loop. Batches records for output, and maintains the linecontext listing. =cut sub getline { my $l = ; $count++; if (defined $l) { if ($l =~ //) { @record = ($l); %reccontext = (); $reccount++; } elsif ($l =~ m||) { push @record, $l; write_record($NUMARC) if $reccount; } else { push @record, $l; } } return $l; } sub write_record { my ($FH) = @_; print $FH '',"\n"; print $FH @record; print $FH "\n"; } sub update_linecontext { my $line2 = ; push @linecontext, $line2; shift @linecontext if (@linecontext > 5); } #----------------------------------------------------------------------------------- # command routines #----------------------------------------------------------------------------------- sub print_context { print "\n Tag:",$reccontext{tag}, " Ind1:'", $reccontext{ind1},"' Ind2:'", $reccontext{ind2}, "'"; print_linecontext(); return 0; } sub print_linecontext { print $OUT "\n", join(' |','',@linecontext[0..2]); print $OUT '==> |', $linecontext[3]; print $OUT ' |', $linecontext[4],"\n"; return 0; } sub show_original { my ($line_in) = @_; print $OUT "\n$line_in\n"; return 0; } sub merge_lines { my $last = pop @record; $last =~ s/^\s+//; $record[-1] =~ s/\n//; $record[-1] = join('', $record[-1], $last); my @temp = ("\n"); push @temp, @linecontext[0..1]; $temp[3] = $record[-1]; $temp[4] = $linecontext[4]; @linecontext = @temp; print_linecontext(); return 0; } sub kill_line { pop @record; $linecontext[3] = " [LINE KILLED]\n"; print_linecontext(); return 0; } sub dump_record { my $line = ; $count++; update_linecontext(); until ($line =~ m||) { push @record, $line; $line = ; $count++; update_linecontext(); } push @record, $line; write_record($EXMARC); return 1; } sub commit_edit { return 1 } sub help { print $OUT <