#!/usr/bin/perl
-open MARC, '<', 'incoming.marc.xml';
-open NUMARC, '>', 'incoming.clean.marc.xml';
+use strict;
+use warnings;
-$line1 = <MARC>;
+use Getopt::Long;
+use Term::ReadLine;
-while ($line2 = <MARC>) {
- if ($line1 =~ m/<datafield tag="..." ind1=" " ind2=" ">/) {
- if ($line2 =~ m|</datafield>|) {
- $line1 = $line2;
+$| = 1;
+
+my $term = new Term::ReadLine 'yaz-cleanup';
+my $OUT = $term->OUT || \*STDOUT;
+
+my $count = 0;
+my $reccount = 0;
+my $line = '';
+
+my @record = (); # current record storage
+my %reccontext = ();
+my @linecontext= (); # last 5 lines of file
+
+my $input = shift || 'incoming.marc.xml';
+
+open MARC, '<', $input;
+open my $NUMARC, '>', 'incoming.clean.marc.xml';
+print $NUMARC '<?xml version="1.0" encoding="UTF-8"?>',"\n";
+open my $EXMARC, '>', 'incoming.exceptions.marc.xml';
+print $EXMARC '<?xml version="1.0" encoding="UTF-8"?>',"\n";
+open MARC2, '<', $input;
+<MARC2>;
+
+# this is the dispatch table which drives command selection in
+# edit(), below
+my %commands = ( c => \&print_context,
+ C => \&print_linecontext,
+ k => \&kill_line,
+ o => \&show_original,
+ m => \&merge_lines,
+ t => \&commit_edit,
+ x => \&dump_record,
+ q => \&quit,
+ '?' => \&help,
+ h => \&help,
+ help => \&help,
+ );
+
+my @spinner = qw(- / | \\);
+my $sidx = 0;
+
+while (my $line = getline()) {
+ unless ($count % 2000) {
+ print "\rWorking... ", $spinner[$sidx];
+ $sidx = $sidx > $#spinner ? 0 : $sidx++;
+ }
+ update_linecontext();
+
+ # catch empty datafield elements
+ if ($line =~ m|</datafield>|) {
+ if ($record[-2] =~ m/<datafield tag="..." ind1="." ind2=".">/) {
+ pop @record; pop @record;
+ print $OUT "\rEmpty datafield scrubbed at line $count\n";
+ next;
+ }
+ }
+
+ # clean misplaced dollarsigns
+ if ($line =~ m|<subfield code="\$">c?\d+\.\d{2}|) {
+ $line =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|;
+ print $OUT "\rDollar sign in subfield code corrected at line $count\n";
+ }
+
+ # clean up tags with spaces in them
+ $line =~ s/tag=" /tag="00/g;
+ $line =~ s/tag=" /tag="0/g;
+ $line =~ s/tag="-/tag="0/g;
+ $line =~ s/tag="(\d\d) /tag="0$1/g;
+
+ # stow tag data if we're looking at it
+ if ($line =~ m/<datafield tag="(\d{3})" ind1="(.)" ind2="(.)">/) {
+ $reccontext{tag} = $1;
+ $reccontext{ind1} = $2;
+ $reccontext{ind2} = $3;
+ }
+
+ # naked ampersands
+ if ($line =~ /&/ && $line !~ /&\w+?;/)
+ { edit("Looks like naked ampersand", $line); next }
+
+ # subfields can't be non-alphanumeric
+ if ($line =~ /<subfield code="(.+?)"/) {
+ my $match = $1;
+ if ($match =~ /\P{IsAlnum}/) {
+ print $OUT "\n$match\n";
+ edit("Junk in subfield", $line);
next;
}
}
- $line1 =~ s/tag=" /tag="00/g;
- $line1 =~ s/tag=" /tag="0/g;
- $line1 =~ s/tag="-/tag="0/g;
- $line1 =~ s/tag="(\d\d) /tag="0$1/g;
- print NUMARC $line1;
- $line1 = $line2;
+
+}
+print $NUMARC "</xml>\n";
+print $EXMARC "</xml>\n";
+
+=head2 edit
+
+Handles the Term::ReadLine loop
+
+=cut
+
+sub edit {
+ my ($msg, $line_in) = @_;
+ print $OUT "\r".$msg, " at line $count:\n";
+ print_context();
+ while (1) {
+ my $line = $term->readline('yaz-cleanup>');
+ if (length $line < 2)
+ { next unless (defined $commands{$line}) }
+ if (defined $commands{$line}) {
+ my $term = $commands{$line}->($line_in);
+ last if $term;
+ } else {
+ if ($linecontext[3] eq " [LINE KILLED]\n") {
+ push @record, "$line\n"
+ } else {
+ $record[-1] = "$line\n";
+ }
+ $linecontext[3] = "$line\n";
+ print_linecontext();
+ }
+ }
+}
+
+=head2 getline
+
+Reads from the incoming MARC file; returns lines into the driver
+loop. Batches records for output, and maintains the linecontext listing.
+
+=cut
+
+sub getline {
+ my $l = <MARC>;
+ $count++;
+ if (defined $l) {
+ if ($l =~ /<record>/) {
+ @record = ($l);
+ %reccontext = ();
+ $reccount++;
+ } elsif ($l =~ m|</record>|) {
+ push @record, $l;
+ write_record($NUMARC) if $reccount;
+ } else {
+ push @record, $l;
+ }
+ }
+ return $l;
+}
+
+sub write_record {
+ my ($FH) = @_;
+ print $FH '<collection xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd" xmlns="http://www.loc.gov/MARC21/slim">',"\n";
+ print $FH @record;
+ print $FH "</collection>\n";
+}
+
+sub update_linecontext {
+ my $line2 = <MARC2>;
+ push @linecontext, $line2;
+ shift @linecontext if (@linecontext > 5);
+}
+
+#-----------------------------------------------------------------------------------
+# command routines
+#-----------------------------------------------------------------------------------
+
+sub print_context {
+ print "\n Tag:",$reccontext{tag}, " Ind1:'",
+ $reccontext{ind1},"' Ind2:'", $reccontext{ind2}, "'";
+ print_linecontext();
+ return 0;
+}
+
+sub print_linecontext {
+ print $OUT "\n", join(' |','',@linecontext[0..2]);
+ print $OUT '==> |', $linecontext[3];
+ print $OUT ' |', $linecontext[4],"\n";
+ return 0;
+}
+
+sub show_original {
+ my ($line_in) = @_;
+ print $OUT "\n$line_in\n";
+ return 0;
+}
+
+sub merge_lines {
+ my $last = pop @record;
+ $last =~ s/^\s+//;
+ $record[-1] =~ s/\n//;
+ $record[-1] = join('', $record[-1], $last);
+ my @temp = ("\n");
+ push @temp, @linecontext[0..1];
+ $temp[3] = $record[-1];
+ $temp[4] = $linecontext[4];
+ @linecontext = @temp;
+ print_linecontext();
+ return 0;
+}
+
+sub kill_line {
+ pop @record;
+ $linecontext[3] = " [LINE KILLED]\n";
+ print_linecontext();
+ return 0;
+}
+
+sub dump_record {
+ my $line = <MARC>; $count++;
+ update_linecontext();
+ until ($line =~ m|</record>|) {
+ push @record, $line;
+ $line = <MARC>; $count++;
+ update_linecontext();
+ }
+ push @record, $line;
+ write_record($EXMARC);
+ return 1;
}
-print NUMARC $line1;
+
+sub commit_edit { return 1 }
+
+sub help {
+print $OUT <<HELP;
+
+Type a replacement for the indicated line, or enter a command.
+
+Commands: c Show record context ('C' for brief context)
+ k Kill this line (remove from record)
+ m Merge indicated line with previous line
+ o Show original line
+ t Commit changes and resume stream edit
+ x Write this record to the exception file instead of output
+ q Quit
+
+HELP
+return 0;
+}
+
+sub quit { exit }