11 my $term = new Term::ReadLine 'yaz-cleanup';
12 my $OUT = $term->OUT || \*STDOUT;
18 my @record = (); # current record storage
19 my @context= (); # last 5 lines of file
21 open MARC, '<', 'incoming.marc.xml';
22 open my $NUMARC, '>', 'incoming.clean.marc.xml';
23 print $NUMARC '<?xml version="1.0" encoding="UTF-8"?>',"\n";
24 open my $EXMARC, '>', 'incoming.exceptions.marc.xml';
25 print $EXMARC '<?xml version="1.0" encoding="UTF-8"?>',"\n";
26 open MARC2, '<', 'incoming.marc.xml';
29 # this is the dispatch table which drives command selection in
31 my %commands = ( c => \&print_context,
42 my @spinner = qw(- / | \\);
45 while (my $line = getline()) {
46 unless ($count % 2000) {
47 print "\rWorking... ", $spinner[$sidx];
49 $sidx = 0 if ($sidx > $#spinner);
53 # catch empty datafield elements
54 if ($line =~ m|</datafield>|) {
55 if ($record[-2] =~ m/<datafield tag="..." ind1="." ind2=".">/) {
56 pop @record; pop @record;
57 print $OUT "\rEmpty datafield scrubbed at line $count\n";
62 # clean misplaced dollarsigns
63 if ($line =~ m|<subfield code="\$">c?\d+\.\d{2}|) {
64 $line =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|;
65 print $OUT "\rDollar sign in subfield code corrected at line $count\n";
68 # clean up tags with spaces in them
69 $line =~ s/tag=" /tag="00/g;
70 $line =~ s/tag=" /tag="0/g;
71 $line =~ s/tag="-/tag="0/g;
72 $line =~ s/tag="(\d\d) /tag="0$1/g;
75 if ($line =~ /&/ && $line !~ /&\w{1,7};/)
76 { edit("Looks like naked ampersand", $line); next }
78 # subfields can't be non-alphanumeric
79 if ($line =~ /<subfield code="[^[:alnum:]]"/)
80 { edit("Junk in subfield", $line); next }
83 print $NUMARC "</xml>\n";
84 print $EXMARC "</xml>\n";
88 Handles the Term::ReadLine loop
93 my ($msg, $line_in) = @_;
94 print $OUT "\r".$msg, " at line $count:\n";
97 my $line = $term->readline('yaz-cleanup>');
98 if (defined $commands{$line}) {
99 my $term = $commands{$line}->($line_in);
102 if ($context[3] eq " [LINE KILLED]\n") {
103 push @record, "$line\n"
105 $record[-1] = "$line\n";
107 $context[3] = "$line\n";
115 Reads from the incoming MARC file; returns lines into the driver
116 loop. Batches records for output, and maintains the context listing.
124 if ($l =~ /<record>/) {
127 } elsif ($l =~ m|</record>|) {
128 write_record($NUMARC) if $reccount;
138 print $FH '<collection xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd" xmlns="http://www.loc.gov/MARC21/slim">',"\n";
140 print $FH "</collection>\n";
145 push @context, $line2;
146 shift @context if (@context > 5);
149 #-----------------------------------------------------------------------------------
151 #-----------------------------------------------------------------------------------
154 print $OUT "\n", join(' |','',@context[0..2]);
155 print $OUT '==> |', $context[3];
156 print $OUT ' |', $context[4],"\n";
162 print $OUT "\n$line_in\n";
166 sub commit_edit { return 1 }
170 $context[3] = " [LINE KILLED]\n";
176 my $line = <MARC>; $count++;
178 until ($line =~ m|</record>|) {
180 $line = <MARC>; $count++;
184 write_record($EXMARC);
190 Type a replacement for the indicated line, or enter a command.
192 Commands: c Show line context
193 k Kill this line (remove from record)
195 t Commit changes and resume stream edit
196 x Write this record to the exception file instead of output