11 my $term = new Term::ReadLine 'yaz-cleanup';
12 my $OUT = $term->OUT || \*STDOUT;
18 my @record = (); # current record storage
20 my @linecontext= (); # last 5 lines of file
22 my $input = shift || 'incoming.marc.xml';
24 open MARC, '<', $input;
25 open my $NUMARC, '>', 'incoming.clean.marc.xml';
26 print $NUMARC '<?xml version="1.0" encoding="UTF-8"?>',"\n";
27 print $NUMARC '<collection xmlns="http://www.loc.gov/MARC21/slim">',"\n";
29 open my $EXMARC, '>', 'incoming.exceptions.marc.xml';
30 print $EXMARC '<?xml version="1.0" encoding="UTF-8"?>',"\n";
31 print $EXMARC '<collection xmlns="http://www.loc.gov/MARC21/slim">',"\n";
32 open MARC2, '<', $input;
35 # this is the dispatch table which drives command selection in
37 my %commands = ( c => \&print_context,
38 C => \&print_linecontext,
51 my @spinner = qw(- / | \\);
54 while (my $line = getline()) {
55 unless ($count % 2000) {
56 print "\rWorking... ", $spinner[$sidx];
57 $sidx = ($sidx == $#spinner) ? 0 : $sidx + 1;
61 # catch empty datafield elements
62 if ($line =~ m|</datafield>|) {
63 if ($record[-2] =~ m/<datafield tag="..." ind1="." ind2=".">/) {
64 pop @record; pop @record;
65 print $OUT "\rEmpty datafield scrubbed at line $count\n";
70 # clean misplaced dollarsigns
71 if ($line =~ m|<subfield code="\$">c?\d+\.\d{2}|) {
72 $line =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|;
73 print $OUT "\rDollar sign in subfield code corrected at line $count\n";
76 # clean up tags with spaces in them
77 $line =~ s/tag=" /tag="00/g;
78 $line =~ s/tag=" /tag="0/g;
79 $line =~ s/tag="-/tag="0/g;
80 $line =~ s/tag="(\d\d) /tag="0$1/g;
82 # stow tag data if we're looking at it
83 if ($line =~ m/<datafield tag="(.{3})" ind1="(.)" ind2="(.)">/) {
84 $reccontext{tag} = $1;
85 $reccontext{ind1} = $2;
86 $reccontext{ind2} = $3;
90 if ($line =~ /&/ && $line !~ /&\w+?;/)
91 { edit("Naked ampersand", $line); next }
93 # tags must be numeric
94 if ($line =~ /<datafield tag="(.+?)"/) {
97 edit("Non-numerics in tag", $line);
102 # subfields can't be non-alphanumeric
103 if ($line =~ /<subfield code="(.+?)"/) {
105 if ($match =~ /\P{IsAlnum}/) {
106 edit("Junk in subfield code", $line);
112 print $NUMARC "</collection>\n";
113 print $EXMARC "</collection>\n";
114 print $OUT "\nDone. \n";
118 Handles the Term::ReadLine loop
123 my ($msg, $line_in) = @_;
124 print $OUT "\r".$msg, " at line $count (record $reccount)\n";
128 my $line = $term->readline('marc-cleanup>');
129 my @chunks = split /\s+/, $line;
131 if (length $chunks[0] == 1)
132 { next unless (defined $commands{$chunks[0]}) }
134 if (defined $commands{$chunks[0]}) {
135 my $term = $commands{$chunks[0]}->($line_in, @chunks[1..$#chunks]);
138 if ($linecontext[3] eq " [LINE KILLED]\n") {
139 push @record, "$line\n"
141 $record[-1] = "$line\n";
143 $linecontext[3] = "$line\n";
151 Reads from the incoming MARC file; returns lines into the driver
152 loop. Batches records for output, and maintains the linecontext listing.
160 if ($l =~ /<record>/) {
164 } elsif ($l =~ m|</record>|) {
166 write_record($NUMARC) if $reccount;
176 print $FH '<!-- ', $reccontext{explanation}, " -->\n"
177 if(defined $reccontext{explanation});
181 sub update_linecontext {
183 push @linecontext, $line2;
184 shift @linecontext if (@linecontext > 5);
187 #-----------------------------------------------------------------------------------
189 #-----------------------------------------------------------------------------------
192 my ($line_in, $ofrom, $to) = @_;
194 for my $char (split(//,$ofrom)) {
195 $char = "\\" . $char if ($char =~ /\W/);
196 $from = join('', $from, $char);
198 $record[-1] =~ s/$from/$to/;
199 $linecontext[3] = $record[-1];
205 my $last = pop @record;
207 $record[-1] =~ s/\n//;
208 $record[-1] = join('', $record[-1], $last);
210 push @temp, @linecontext[0..1];
211 $temp[3] = $record[-1];
212 $temp[4] = $linecontext[4];
213 @linecontext = @temp;
220 $linecontext[3] = " [LINE KILLED]\n";
226 my ($line_in, @explanation) = @_;
227 $reccontext{explanation} = join(' ', @explanation);
228 my $line = <MARC>; $count++;
229 update_linecontext();
230 until ($line =~ m|</record>|) {
232 $line = <MARC>; $count++;
233 update_linecontext();
236 write_record($EXMARC);
240 sub commit_edit { return 1 }
243 print "\n Tag:",$reccontext{tag}, " Ind1:'",
244 $reccontext{ind1},"' Ind2:'", $reccontext{ind2}, "'";
249 sub print_linecontext {
250 print $OUT "\n", join(' |','',@linecontext[0..2]);
251 print $OUT '==> |', $linecontext[3];
252 print $OUT ' |', $linecontext[4],"\n";
258 print $OUT "\n$line_in\n";
265 Type a replacement for the indicated line, or enter a command.
267 Commands: c Show record context ('C' for brief context)
268 k Kill indicated line (remove from record)
269 m Merge indicated line with previous line
271 s Substitute ARG1 for ARG2 in indicated line
272 t Commit changes and resume stream edit
273 x Write this record to the exception file instead of output