From 5b507cf3b85bdfde3eb8969d8de9f0b3ed326af5 Mon Sep 17 00:00:00 2001 From: Shawn Boyette Date: Tue, 23 Sep 2008 19:04:24 +0000 Subject: [PATCH] rename --- marc-cleanup | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ yaz-cleanup | 247 ---------------------------------------------------------- 2 files changed, 247 insertions(+), 247 deletions(-) create mode 100755 marc-cleanup delete mode 100755 yaz-cleanup diff --git a/marc-cleanup b/marc-cleanup new file mode 100755 index 0000000..998bfa5 --- /dev/null +++ b/marc-cleanup @@ -0,0 +1,247 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +use Getopt::Long; +use Term::ReadLine; + +$| = 1; + +my $term = new Term::ReadLine 'yaz-cleanup'; +my $OUT = $term->OUT || \*STDOUT; + +my $count = 0; +my $reccount = 0; +my $line = ''; + +my @record = (); # current record storage +my %reccontext = (); +my @linecontext= (); # last 5 lines of file + +my $input = shift || 'incoming.marc.xml'; + +open MARC, '<', $input; +open my $NUMARC, '>', 'incoming.clean.marc.xml'; +print $NUMARC '',"\n"; +open my $EXMARC, '>', 'incoming.exceptions.marc.xml'; +print $EXMARC '',"\n"; +open MARC2, '<', $input; +; + +# this is the dispatch table which drives command selection in +# edit(), below +my %commands = ( c => \&print_context, + C => \&print_linecontext, + k => \&kill_line, + o => \&show_original, + m => \&merge_lines, + t => \&commit_edit, + x => \&dump_record, + q => \&quit, + '?' => \&help, + h => \&help, + help => \&help, + ); + +my @spinner = qw(- / | \\); +my $sidx = 0; + +while (my $line = getline()) { + unless ($count % 2000) { + print "\rWorking... ", $spinner[$sidx]; + $sidx = $sidx > $#spinner ? 0 : $sidx++; + } + update_linecontext(); + + # catch empty datafield elements + if ($line =~ m||) { + if ($record[-2] =~ m//) { + pop @record; pop @record; + print $OUT "\rEmpty datafield scrubbed at line $count\n"; + next; + } + } + + # clean misplaced dollarsigns + if ($line =~ m|c?\d+\.\d{2}|) { + $line =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|; + print $OUT "\rDollar sign in subfield code corrected at line $count\n"; + } + + # clean up tags with spaces in them + $line =~ s/tag=" /tag="00/g; + $line =~ s/tag=" /tag="0/g; + $line =~ s/tag="-/tag="0/g; + $line =~ s/tag="(\d\d) /tag="0$1/g; + + # stow tag data if we're looking at it + if ($line =~ m//) { + $reccontext{tag} = $1; + $reccontext{ind1} = $2; + $reccontext{ind2} = $3; + } + + # naked ampersands + if ($line =~ /&/ && $line !~ /&\w+?;/) + { edit("Looks like naked ampersand", $line); next } + + # subfields can't be non-alphanumeric + if ($line =~ /\n"; +print $EXMARC "\n"; + +=head2 edit + +Handles the Term::ReadLine loop + +=cut + +sub edit { + my ($msg, $line_in) = @_; + print $OUT "\r".$msg, " at line $count:\n"; + print_context(); + while (1) { + my $line = $term->readline('yaz-cleanup>'); + if (length $line < 2) + { next unless (defined $commands{$line}) } + if (defined $commands{$line}) { + my $term = $commands{$line}->($line_in); + last if $term; + } else { + if ($linecontext[3] eq " [LINE KILLED]\n") { + push @record, "$line\n" + } else { + $record[-1] = "$line\n"; + } + $linecontext[3] = "$line\n"; + print_linecontext(); + } + } +} + +=head2 getline + +Reads from the incoming MARC file; returns lines into the driver +loop. Batches records for output, and maintains the linecontext listing. + +=cut + +sub getline { + my $l = ; + $count++; + if (defined $l) { + if ($l =~ //) { + @record = ($l); + %reccontext = (); + $reccount++; + } elsif ($l =~ m||) { + push @record, $l; + write_record($NUMARC) if $reccount; + } else { + push @record, $l; + } + } + return $l; +} + +sub write_record { + my ($FH) = @_; + print $FH '',"\n"; + print $FH @record; + print $FH "\n"; +} + +sub update_linecontext { + my $line2 = ; + push @linecontext, $line2; + shift @linecontext if (@linecontext > 5); +} + +#----------------------------------------------------------------------------------- +# command routines +#----------------------------------------------------------------------------------- + +sub print_context { + print "\n Tag:",$reccontext{tag}, " Ind1:'", + $reccontext{ind1},"' Ind2:'", $reccontext{ind2}, "'"; + print_linecontext(); + return 0; +} + +sub print_linecontext { + print $OUT "\n", join(' |','',@linecontext[0..2]); + print $OUT '==> |', $linecontext[3]; + print $OUT ' |', $linecontext[4],"\n"; + return 0; +} + +sub show_original { + my ($line_in) = @_; + print $OUT "\n$line_in\n"; + return 0; +} + +sub merge_lines { + my $last = pop @record; + $last =~ s/^\s+//; + $record[-1] =~ s/\n//; + $record[-1] = join('', $record[-1], $last); + my @temp = ("\n"); + push @temp, @linecontext[0..1]; + $temp[3] = $record[-1]; + $temp[4] = $linecontext[4]; + @linecontext = @temp; + print_linecontext(); + return 0; +} + +sub kill_line { + pop @record; + $linecontext[3] = " [LINE KILLED]\n"; + print_linecontext(); + return 0; +} + +sub dump_record { + my $line = ; $count++; + update_linecontext(); + until ($line =~ m||) { + push @record, $line; + $line = ; $count++; + update_linecontext(); + } + push @record, $line; + write_record($EXMARC); + return 1; +} + +sub commit_edit { return 1 } + +sub help { +print $OUT <OUT || \*STDOUT; - -my $count = 0; -my $reccount = 0; -my $line = ''; - -my @record = (); # current record storage -my %reccontext = (); -my @linecontext= (); # last 5 lines of file - -my $input = shift || 'incoming.marc.xml'; - -open MARC, '<', $input; -open my $NUMARC, '>', 'incoming.clean.marc.xml'; -print $NUMARC '',"\n"; -open my $EXMARC, '>', 'incoming.exceptions.marc.xml'; -print $EXMARC '',"\n"; -open MARC2, '<', $input; -; - -# this is the dispatch table which drives command selection in -# edit(), below -my %commands = ( c => \&print_context, - C => \&print_linecontext, - k => \&kill_line, - o => \&show_original, - m => \&merge_lines, - t => \&commit_edit, - x => \&dump_record, - q => \&quit, - '?' => \&help, - h => \&help, - help => \&help, - ); - -my @spinner = qw(- / | \\); -my $sidx = 0; - -while (my $line = getline()) { - unless ($count % 2000) { - print "\rWorking... ", $spinner[$sidx]; - $sidx = $sidx > $#spinner ? 0 : $sidx++; - } - update_linecontext(); - - # catch empty datafield elements - if ($line =~ m||) { - if ($record[-2] =~ m//) { - pop @record; pop @record; - print $OUT "\rEmpty datafield scrubbed at line $count\n"; - next; - } - } - - # clean misplaced dollarsigns - if ($line =~ m|c?\d+\.\d{2}|) { - $line =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|; - print $OUT "\rDollar sign in subfield code corrected at line $count\n"; - } - - # clean up tags with spaces in them - $line =~ s/tag=" /tag="00/g; - $line =~ s/tag=" /tag="0/g; - $line =~ s/tag="-/tag="0/g; - $line =~ s/tag="(\d\d) /tag="0$1/g; - - # stow tag data if we're looking at it - if ($line =~ m//) { - $reccontext{tag} = $1; - $reccontext{ind1} = $2; - $reccontext{ind2} = $3; - } - - # naked ampersands - if ($line =~ /&/ && $line !~ /&\w+?;/) - { edit("Looks like naked ampersand", $line); next } - - # subfields can't be non-alphanumeric - if ($line =~ /\n"; -print $EXMARC "\n"; - -=head2 edit - -Handles the Term::ReadLine loop - -=cut - -sub edit { - my ($msg, $line_in) = @_; - print $OUT "\r".$msg, " at line $count:\n"; - print_context(); - while (1) { - my $line = $term->readline('yaz-cleanup>'); - if (length $line < 2) - { next unless (defined $commands{$line}) } - if (defined $commands{$line}) { - my $term = $commands{$line}->($line_in); - last if $term; - } else { - if ($linecontext[3] eq " [LINE KILLED]\n") { - push @record, "$line\n" - } else { - $record[-1] = "$line\n"; - } - $linecontext[3] = "$line\n"; - print_linecontext(); - } - } -} - -=head2 getline - -Reads from the incoming MARC file; returns lines into the driver -loop. Batches records for output, and maintains the linecontext listing. - -=cut - -sub getline { - my $l = ; - $count++; - if (defined $l) { - if ($l =~ //) { - @record = ($l); - %reccontext = (); - $reccount++; - } elsif ($l =~ m||) { - push @record, $l; - write_record($NUMARC) if $reccount; - } else { - push @record, $l; - } - } - return $l; -} - -sub write_record { - my ($FH) = @_; - print $FH '',"\n"; - print $FH @record; - print $FH "\n"; -} - -sub update_linecontext { - my $line2 = ; - push @linecontext, $line2; - shift @linecontext if (@linecontext > 5); -} - -#----------------------------------------------------------------------------------- -# command routines -#----------------------------------------------------------------------------------- - -sub print_context { - print "\n Tag:",$reccontext{tag}, " Ind1:'", - $reccontext{ind1},"' Ind2:'", $reccontext{ind2}, "'"; - print_linecontext(); - return 0; -} - -sub print_linecontext { - print $OUT "\n", join(' |','',@linecontext[0..2]); - print $OUT '==> |', $linecontext[3]; - print $OUT ' |', $linecontext[4],"\n"; - return 0; -} - -sub show_original { - my ($line_in) = @_; - print $OUT "\n$line_in\n"; - return 0; -} - -sub merge_lines { - my $last = pop @record; - $last =~ s/^\s+//; - $record[-1] =~ s/\n//; - $record[-1] = join('', $record[-1], $last); - my @temp = ("\n"); - push @temp, @linecontext[0..1]; - $temp[3] = $record[-1]; - $temp[4] = $linecontext[4]; - @linecontext = @temp; - print_linecontext(); - return 0; -} - -sub kill_line { - pop @record; - $linecontext[3] = " [LINE KILLED]\n"; - print_linecontext(); - return 0; -} - -sub dump_record { - my $line = ; $count++; - update_linecontext(); - until ($line =~ m||) { - push @record, $line; - $line = ; $count++; - update_linecontext(); - } - push @record, $line; - write_record($EXMARC); - return 1; -} - -sub commit_edit { return 1 } - -sub help { -print $OUT <