11 my $term = new Term::ReadLine 'yaz-cleanup';
12 my $OUT = $term->OUT || \*STDOUT;
18 my @record = (); # current record storage
20 my @linecontext= (); # last 5 lines of file
22 open MARC, '<', 'incoming.marc.xml';
23 open my $NUMARC, '>', 'incoming.clean.marc.xml';
24 print $NUMARC '<?xml version="1.0" encoding="UTF-8"?>',"\n";
25 open my $EXMARC, '>', 'incoming.exceptions.marc.xml';
26 print $EXMARC '<?xml version="1.0" encoding="UTF-8"?>',"\n";
27 open MARC2, '<', 'incoming.marc.xml';
30 # this is the dispatch table which drives command selection in
32 my %commands = ( c => \&print_context,
33 C => \&print_linecontext,
45 my @spinner = qw(- / | \\);
48 while (my $line = getline()) {
49 unless ($count % 2000) {
50 print "\rWorking... ", $spinner[$sidx];
51 $sidx = $sidx > $#spinner ? 0 : $sidx++;
55 # catch empty datafield elements
56 if ($line =~ m|</datafield>|) {
57 if ($record[-2] =~ m/<datafield tag="..." ind1="." ind2=".">/) {
58 pop @record; pop @record;
59 print $OUT "\rEmpty datafield scrubbed at line $count\n";
64 # clean misplaced dollarsigns
65 if ($line =~ m|<subfield code="\$">c?\d+\.\d{2}|) {
66 $line =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|;
67 print $OUT "\rDollar sign in subfield code corrected at line $count\n";
70 # clean up tags with spaces in them
71 $line =~ s/tag=" /tag="00/g;
72 $line =~ s/tag=" /tag="0/g;
73 $line =~ s/tag="-/tag="0/g;
74 $line =~ s/tag="(\d\d) /tag="0$1/g;
76 # stow tag data if we're looking at it
77 if ($line =~ m/<datafield tag="(\d{3})" ind1="(.)" ind2="(.)">/) {
78 $reccontext{tag} = $1;
79 $reccontext{ind1} = $2;
80 $reccontext{ind2} = $3;
84 if ($line =~ /&/ && $line !~ /&\w{1,7};/)
85 { edit("Looks like naked ampersand", $line); next }
87 # subfields can't be non-alphanumeric
88 if ($line =~ /<subfield code="[^[:alnum:]]"/)
89 { edit("Junk in subfield", $line); next }
92 print $NUMARC "</xml>\n";
93 print $EXMARC "</xml>\n";
97 Handles the Term::ReadLine loop
102 my ($msg, $line_in) = @_;
103 print $OUT "\r".$msg, " at line $count:\n";
106 my $line = $term->readline('yaz-cleanup>');
107 if (length $line < 2)
108 { next unless (defined $commands{$line}) }
109 if (defined $commands{$line}) {
110 my $term = $commands{$line}->($line_in);
113 if ($linecontext[3] eq " [LINE KILLED]\n") {
114 push @record, "$line\n"
116 $record[-1] = "$line\n";
118 $linecontext[3] = "$line\n";
126 Reads from the incoming MARC file; returns lines into the driver
127 loop. Batches records for output, and maintains the linecontext listing.
135 if ($l =~ /<record>/) {
139 } elsif ($l =~ m|</record>|) {
140 write_record($NUMARC) if $reccount;
150 print $FH '<collection xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd" xmlns="http://www.loc.gov/MARC21/slim">',"\n";
152 print $FH "</collection>\n";
155 sub update_linecontext {
157 push @linecontext, $line2;
158 shift @linecontext if (@linecontext > 5);
161 #-----------------------------------------------------------------------------------
163 #-----------------------------------------------------------------------------------
166 print "\n Tag:",$reccontext{tag}, " Ind1:'",
167 $reccontext{ind1},"' Ind2:'", $reccontext{ind2}, "'";
172 sub print_linecontext {
173 print $OUT "\n", join(' |','',@linecontext[0..2]);
174 print $OUT '==> |', $linecontext[3];
175 print $OUT ' |', $linecontext[4],"\n";
181 print $OUT "\n$line_in\n";
186 my $last = pop @record;
188 $record[-1] =~ s/\n//;
189 $record[-1] = join('', $record[-1], $last);
191 push @temp, @linecontext[0..1];
192 $temp[3] = $record[-1];
193 $temp[4] = $linecontext[4];
194 @linecontext = @temp;
201 $linecontext[3] = " [LINE KILLED]\n";
207 my $line = <MARC>; $count++;
208 update_linecontext();
209 until ($line =~ m|</record>|) {
211 $line = <MARC>; $count++;
212 update_linecontext();
215 write_record($EXMARC);
219 sub commit_edit { return 1 }
224 Type a replacement for the indicated line, or enter a command.
226 Commands: c Show record context ('C' for brief context)
227 k Kill this line (remove from record)
228 m Merge indicated line with previous line
230 t Commit changes and resume stream edit
231 x Write this record to the exception file instead of output