11 my $term = new Term::ReadLine 'yaz-cleanup';
12 my $OUT = $term->OUT || \*STDOUT;
19 my @record = (); # current record storage
21 my @linecontext= (); # last 5 lines of file
23 my $input = shift || 'incoming.marc.xml';
25 open MARC, '<', $input;
26 open my $NUMARC, '>', 'incoming.clean.marc.xml';
27 print $NUMARC '<?xml version="1.0" encoding="UTF-8"?>',"\n";
28 print $NUMARC '<collection xmlns="http://www.loc.gov/MARC21/slim">',"\n";
30 open my $EXMARC, '>', 'incoming.exceptions.marc.xml';
31 print $EXMARC '<?xml version="1.0" encoding="UTF-8"?>',"\n";
32 print $EXMARC '<collection xmlns="http://www.loc.gov/MARC21/slim">',"\n";
33 open MARC2, '<', $input;
36 # this is the dispatch table which drives command selection in
38 my %commands = ( c => \&print_context,
39 C => \&print_linecontext,
52 my @spinner = qw(- / | \\);
55 while (my $line = getline()) {
56 unless ($count % 2000) {
57 print "\rWorking... ", $spinner[$sidx];
58 $sidx = ($sidx == $#spinner) ? 0 : $sidx + 1;
62 # catch empty datafield elements
63 if ($line =~ m|</datafield>|) {
64 if ($record[-2] =~ m/<datafield tag="..." ind1="." ind2=".">/) {
65 pop @record; pop @record;
66 message("Empty datafield scrubbed");
72 if ($line =~ m|<leader>(.+?)</leader>|) {
74 if (length $leader < 24) {
75 $leader .= ' ' x (20 - length($leader));
77 $line = "<leader>$leader</leader>\n";
78 message("Short leader padded");
82 # clean misplaced dollarsigns
83 if ($line =~ m|<subfield code="\$">c?\d+\.\d{2}|) {
84 $line =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|;
85 message("Dollar sign corrected");
88 # clean up tags with spaces in them
89 $line =~ s/tag=" /tag="00/g;
90 $line =~ s/tag=" /tag="0/g;
91 $line =~ s/tag="-/tag="0/g;
92 $line =~ s/tag="(\d\d) /tag="0$1/g;
94 # stow tag data if we're looking at it
95 if ($line =~ m/<datafield tag="(.{3})" ind1="(.)" ind2="(.)">/) {
96 $reccontext{tag} = $1;
97 $reccontext{ind1} = $2;
98 $reccontext{ind2} = $3;
101 # and stow line back in record
105 if ($line =~ /&/ && $line !~ /&\w+?;/)
106 { edit("Naked ampersand", $line); next }
108 # tags must be numeric
109 if ($line =~ /<datafield tag="(.+?)"/) {
111 if ($match =~ /\D/) {
112 edit("Non-numerics in tag", $line);
117 # subfields can't be non-alphanumeric
118 if ($line =~ /<subfield code="(.+?)"/) {
120 if ($match =~ /\P{IsAlnum}/) {
121 edit("Junk in subfield code", $line);
127 print $NUMARC "</collection>\n";
128 print $EXMARC "</collection>\n";
129 print $OUT "\nDone. \n";
133 Handles the Term::ReadLine loop
138 my ($msg, $line_in) = @_;
143 my $line = $term->readline('marc-cleanup>');
144 my @chunks = split /\s+/, $line;
146 if (length $chunks[0] == 1)
147 { next unless (defined $commands{$chunks[0]}) }
149 if (defined $commands{$chunks[0]}) {
150 my $term = $commands{$chunks[0]}->($line_in, @chunks[1..$#chunks]);
153 if ($linecontext[3] eq " [LINE KILLED]\n") {
154 push @record, "$line\n"
156 $record[-1] = "$line\n";
158 $linecontext[3] = "$line\n";
166 Reads from the incoming MARC file; returns lines into the driver
167 loop. Batches records for output, and maintains the linecontext listing.
175 if ($l =~ /<record>/) {
179 } elsif ($l =~ m|</record>|) {
181 write_record($NUMARC) if $reccount;
191 $oreccount++ if ($FH eq $NUMARC);
192 print $FH '<!-- ', $reccontext{explanation}, " -->\n"
193 if(defined $reccontext{explanation});
197 sub update_linecontext {
199 push @linecontext, $line2;
200 shift @linecontext if (@linecontext > 5);
205 print $OUT "\r$msg at record $reccount/",$oreccount + 1,"\n";
209 #-----------------------------------------------------------------------------------
211 #-----------------------------------------------------------------------------------
214 my ($line_in, @chunks) = @_;
215 my $ofrom = shift @chunks;
216 if ($ofrom =~ /^'/ or !@chunks) {
217 until ($ofrom =~ /'$/)
218 { $ofrom .= join(' ','',shift @chunks) }
219 $ofrom =~ s/^'//; $ofrom =~ s/'$//;
221 my $to = shift @chunks;
223 until ($to =~ /'$/ or !@chunks)
224 { $to .= join(' ','',shift @chunks) }
225 $to =~ s/^'//; $to =~ s/'$//;
229 for my $char (split(//,$ofrom)) {
230 $char = "\\" . $char if ($char =~ /\W/);
231 $from = join('', $from, $char);
233 $record[-1] =~ s/$from/$to/;
234 $linecontext[3] = $record[-1];
240 my $last = pop @record;
242 $record[-1] =~ s/\n//;
243 $record[-1] = join('', $record[-1], $last);
245 push @temp, @linecontext[0..1];
246 $temp[3] = $record[-1];
247 $temp[4] = $linecontext[4];
248 @linecontext = @temp;
255 $linecontext[3] = " [LINE KILLED]\n";
261 my ($line_in, @explanation) = @_;
262 $reccontext{explanation} = join(' ', @explanation);
263 my $line = <MARC>; $count++;
264 update_linecontext();
265 until ($line =~ m|</record>|) {
267 $line = <MARC>; $count++;
268 update_linecontext();
271 write_record($EXMARC);
275 sub commit_edit { return 1 }
278 print "\n Tag:",$reccontext{tag}, " Ind1:'",
279 $reccontext{ind1},"' Ind2:'", $reccontext{ind2}, "'";
284 sub print_linecontext {
285 print $OUT "\n", join(' |','',@linecontext[0..2]);
286 print $OUT '==> |', $linecontext[3];
287 print $OUT ' |', $linecontext[4],"\n";
293 print $OUT "\n$line_in\n";
300 Type a replacement for the indicated line, or enter a command.
302 Commands: c Show record context ('C' for brief context)
303 k Kill indicated line (remove from record)
304 m Merge indicated line with previous line
306 s Substitute ARG1 for ARG2 in indicated line
307 t Commit changes and resume stream edit
308 x Write this record to the exception file instead of output