my $term = new Term::ReadLine 'yaz-cleanup';
my $OUT = $term->OUT || \*STDOUT;
-print $OUT "\nInteractive MARC Stream Editor starting up...\n";
my $count = 0;
my $reccount = 0;
my $line = '';
my @record = (); # current record storage
-my @context= (); # last 5 lines of file
+my %reccontext = ();
+my @linecontext= (); # last 5 lines of file
-open MARC, '<', 'incoming.marc.xml';
+my $input = shift || 'incoming.marc.xml';
+
+open MARC, '<', $input;
open my $NUMARC, '>', 'incoming.clean.marc.xml';
print $NUMARC '<?xml version="1.0" encoding="UTF-8"?>',"\n";
open my $EXMARC, '>', 'incoming.exceptions.marc.xml';
-open MARC2, '<', 'incoming.marc.xml';
+print $EXMARC '<?xml version="1.0" encoding="UTF-8"?>',"\n";
+open MARC2, '<', $input;
<MARC2>;
# this is the dispatch table which drives command selection in
# edit(), below
my %commands = ( c => \&print_context,
+ C => \&print_linecontext,
k => \&kill_line,
o => \&show_original,
+ m => \&merge_lines,
t => \&commit_edit,
x => \&dump_record,
q => \&quit,
while (my $line = getline()) {
unless ($count % 2000) {
print "\rWorking... ", $spinner[$sidx];
- $sidx++;
- $sidx = 0 if ($sidx > $#spinner);
+ $sidx = $sidx > $#spinner ? 0 : $sidx++;
}
- update_context();
+ update_linecontext();
# catch empty datafield elements
if ($line =~ m|</datafield>|) {
if ($record[-2] =~ m/<datafield tag="..." ind1="." ind2=".">/) {
pop @record; pop @record;
- print $OUT "Empty datafield scrubbed at line $count\n";
+ print $OUT "\rEmpty datafield scrubbed at line $count\n";
next;
}
}
# clean misplaced dollarsigns
if ($line =~ m|<subfield code="\$">c?\d+\.\d{2}|) {
$line =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|;
- print $OUT "Dollar sign in subfield code corrected at line $count\n";
+ print $OUT "\rDollar sign in subfield code corrected at line $count\n";
}
# clean up tags with spaces in them
$line =~ s/tag="-/tag="0/g;
$line =~ s/tag="(\d\d) /tag="0$1/g;
+ # stow tag data if we're looking at it
+ if ($line =~ m/<datafield tag="(\d{3})" ind1="(.)" ind2="(.)">/) {
+ $reccontext{tag} = $1;
+ $reccontext{ind1} = $2;
+ $reccontext{ind2} = $3;
+ }
+
# naked ampersands
- if ($line =~ /&/ && $line !~ /&\w{1,7};/)
+ if ($line =~ /&/ && $line !~ /&\w+?;/)
{ edit("Looks like naked ampersand", $line); next }
# subfields can't be non-alphanumeric
- if ($line =~ /<subfield code="[^[:alnum:]]"/)
- { edit("Junk in subfield", $line); next }
+ if ($line =~ /<subfield code="(.+?)"/) {
+ my $match = $1;
+ if ($match =~ /\P{IsAlnum}/) {
+ print $OUT "\n$match\n";
+ edit("Junk in subfield", $line);
+ next;
+ }
+ }
}
print $NUMARC "</xml>\n";
+print $EXMARC "</xml>\n";
=head2 edit
sub edit {
my ($msg, $line_in) = @_;
- print $OUT "\n".$msg, " at line $count:\n";
+ print $OUT "\r".$msg, " at line $count:\n";
print_context();
while (1) {
my $line = $term->readline('yaz-cleanup>');
+ if (length $line < 2)
+ { next unless (defined $commands{$line}) }
if (defined $commands{$line}) {
my $term = $commands{$line}->($line_in);
last if $term;
} else {
- push @record, $record[-1] if ($context[3] eq " [LINE KILLED\n]");
- $record[-2] = "$line\n";
- $context[3] = "$line\n";
- print_context();
+ if ($linecontext[3] eq " [LINE KILLED]\n") {
+ push @record, "$line\n"
+ } else {
+ $record[-1] = "$line\n";
+ }
+ $linecontext[3] = "$line\n";
+ print_linecontext();
}
}
}
=head2 getline
Reads from the incoming MARC file; returns lines into the driver
-loop. Batches records for output, and maintains the context listing.
+loop. Batches records for output, and maintains the linecontext listing.
=cut
if (defined $l) {
if ($l =~ /<record>/) {
@record = ($l);
+ %reccontext = ();
$reccount++;
} elsif ($l =~ m|</record>|) {
+ push @record, $l;
write_record($NUMARC) if $reccount;
} else {
push @record, $l;
print $FH "</collection>\n";
}
-sub update_context {
+sub update_linecontext {
my $line2 = <MARC2>;
- push @context, $line2;
- shift @context if (@context > 5);
+ push @linecontext, $line2;
+ shift @linecontext if (@linecontext > 5);
}
#-----------------------------------------------------------------------------------
#-----------------------------------------------------------------------------------
sub print_context {
- print $OUT "\n", join(' |','',@context[0..2]);
- print $OUT '==> |', $context[3];
- print $OUT ' |', $context[4],"\n";
+ print "\n Tag:",$reccontext{tag}, " Ind1:'",
+ $reccontext{ind1},"' Ind2:'", $reccontext{ind2}, "'";
+ print_linecontext();
+ return 0;
+}
+
+sub print_linecontext {
+ print $OUT "\n", join(' |','',@linecontext[0..2]);
+ print $OUT '==> |', $linecontext[3];
+ print $OUT ' |', $linecontext[4],"\n";
return 0;
}
return 0;
}
-sub commit_edit { return 1 }
+sub merge_lines {
+ my $last = pop @record;
+ $last =~ s/^\s+//;
+ $record[-1] =~ s/\n//;
+ $record[-1] = join('', $record[-1], $last);
+ my @temp = ("\n");
+ push @temp, @linecontext[0..1];
+ $temp[3] = $record[-1];
+ $temp[4] = $linecontext[4];
+ @linecontext = @temp;
+ print_linecontext();
+ return 0;
+}
sub kill_line {
- my $tmp = pop @record;
pop @record;
- push @record, $tmp;
- $context[3] = " [LINE KILLED]\n";
- print_context();
+ $linecontext[3] = " [LINE KILLED]\n";
+ print_linecontext();
return 0;
}
sub dump_record {
- my $line = <MARC>;
+ my $line = <MARC>; $count++;
+ update_linecontext();
until ($line =~ m|</record>|) {
push @record, $line;
- $line = <MARC>;
- update_context;
+ $line = <MARC>; $count++;
+ update_linecontext();
}
push @record, $line;
write_record($EXMARC);
+ return 1;
}
+sub commit_edit { return 1 }
+
sub help {
print $OUT <<HELP;
Type a replacement for the indicated line, or enter a command.
-Commands: c Show line context
+Commands: c Show record context ('C' for brief context)
k Kill this line (remove from record)
+ m Merge indicated line with previous line
o Show original line
t Commit changes and resume stream edit
x Write this record to the exception file instead of output