#!/usr/bin/perl use strict; use warnings; use Getopt::Long; use Term::ReadLine; $| = 1; my $term = new Term::ReadLine 'yaz-cleanup'; my $OUT = $term->OUT || \*STDOUT; my $conf = {}; my $count = 0; my $reccount = 0; my $oreccount = 0; my $line = ''; my %trash = (); # hash for tags to be dumped # initialization and setup initialize($conf); # read in trash tags file if it exists populate_trash() if ($conf->{trash}); my @record = (); # current record storage my %recmeta = (); # metadata about current record my @context= (); # last 5 lines of file my $input = shift || 'incoming.marc.xml'; open MARC, '<:utf8', $input; open my $NUMARC, '>:utf8', $conf->{output}; print $NUMARC '',"\n"; print $NUMARC '',"\n"; open my $EXMARC, '>:utf8', $conf->{exception}; print $EXMARC '',"\n"; print $EXMARC '',"\n"; open MARC2, '<', $input; ; # this is the dispatch table which drives command selection in # edit(), below my %commands = ( c => \&print_context, C => \&print_linecontext, k => \&kill_line, o => \&show_original, m => \&merge_lines, s => \&substitute, t => \&commit_edit, x => \&dump_record, q => \&quit, '?' => \&help, h => \&help, help => \&help, ); my @spinner = qw(- / | \\); my $sidx = 0; while (my $line = getline()) { unless ($count % 2000) { print "\rWorking... ", $spinner[$sidx]; $sidx = ($sidx == $#spinner) ? 0 : $sidx + 1; } update_linecontext(); next if ($line =~ m||); # catch empty datafield elements if ($line =~ m||) { if ($record[-2] =~ m//) { pop @record; pop @record; message("Empty datafield scrubbed"); next; } } # pad short leaders if ($line =~ m|(.+?)|) { my $leader = $1; if (length $leader < 24) { $leader .= ' ' x (20 - length($leader)); $leader .= "4500"; $line = "$leader\n"; message("Short leader padded"); } } # clean misplaced dollarsigns if ($line =~ m|c?\d+\.\d{2}|) { $line =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|; message("Dollar sign corrected"); } # clean up tags with spaces in them $line =~ s/tag=" /tag="00/g; $line =~ s/tag=" /tag="0/g; $line =~ s/tag="-/tag="0/g; $line =~ s/tag="(\d\d) /tag="0$1/g; # stow tag data if we're looking at it if ($line =~ m//) { $recmeta{tag} = $1; $recmeta{ind1} = $2; $recmeta{ind2} = $3; } # automatable subfield maladies $line =~ s/code=" ">c/code="c">/; $line =~ s/code=" ">\$/code="c"$>/; # and stow line back in record $record[-1] = $line; # naked ampersands if ($line =~ /&/ && $line !~ /&\w+?;/) { edit("Naked ampersand", $line); next } # tags must be numeric if ($line =~ /\n"; print $EXMARC "\n"; print $OUT "\nDone. \n"; =head2 edit Handles the Term::ReadLine loop =cut sub edit { my ($msg, $line_in) = @_; return if $trash{$recmeta{tag}}; message($msg); print_context(); while (1) { my $line = $term->readline('marc-cleanup>'); my @chunks = split /\s+/, $line; if (length $chunks[0] == 1) { next unless (defined $commands{$chunks[0]}) } if (defined $commands{$chunks[0]}) { my $term = $commands{$chunks[0]}->($line_in, @chunks[1..$#chunks]); last if $term; } else { if ($context[3] eq " [LINE KILLED]\n") { push @record, "$line\n" } else { $record[-1] = "$line\n"; } $context[3] = "$line\n"; print_linecontext(); } } } =head2 getline Reads from the incoming MARC file; returns lines into the driver loop. Batches records for output, and maintains the context listing. =cut sub getline { my $l = ; $count++; if (defined $l) { if ($l =~ //) { @record = ($l); %recmeta = (); $reccount++; } elsif ($l =~ m||) { write_record($NUMARC) if $reccount; } else { push @record, $l; } } return $l; } sub write_record { my ($FH) = @_; $oreccount++ if ($FH eq $NUMARC); print $FH '\n" if(defined $recmeta{explanation}); # excise unwanted tags if (keys %trash or $conf->{autoscrub}) { my @trimmed = (); my $istrash = 0; for my $line (@record) { if ($istrash) { $istrash = 0 if $line =~ m|{autoscrub} and $tag =~ /\D/)) { $istrash = 1; next } } push @trimmed, $line; } @record = @trimmed; } # scrub newlines unless ($conf->{nocollapse}) { s/\n// for (@record); } # add 903(?) with new record id my $renumber = ''; if ($conf->{'renumber-from'}) { $renumber = join('', '', '', $conf->{'renumber-from'}, ''); $renumber .= "\n" if $conf->{nocollapse}; push @record, $renumber; $conf->{'renumber-from'}++; } print $FH @record; print $FH "\n"; } sub update_linecontext { my $line2 = ; push @context, $line2; shift @context if (@context > 5); } sub message { my ($msg) = @_; print $OUT "\r$msg at record $reccount/",$oreccount + 1,"\n"; } #----------------------------------------------------------------------------------- # command routines #----------------------------------------------------------------------------------- sub substitute { my ($line_in, @chunks) = @_; my $ofrom = shift @chunks; if ($ofrom =~ /^'/ or !@chunks) { until ($ofrom =~ /'$/) { $ofrom .= join(' ','',shift @chunks) } $ofrom =~ s/^'//; $ofrom =~ s/'$//; } my $to = shift @chunks; if ($to =~ /^'/) { until ($to =~ /'$/ or !@chunks) { $to .= join(' ','',shift @chunks) } $to =~ s/^'//; $to =~ s/'$//; } my $from = ''; for my $char (split(//,$ofrom)) { $char = "\\" . $char if ($char =~ /\W/); $from = join('', $from, $char); } $record[-1] =~ s/$from/$to/; $context[3] = $record[-1]; print_linecontext(); return 0; } sub merge_lines { my $last = pop @record; $last =~ s/^\s+//; $record[-1] =~ s/\n//; $record[-1] = join('', $record[-1], $last); my @temp = ("\n"); push @temp, @context[0..1]; $temp[3] = $record[-1]; $temp[4] = $context[4]; @context = @temp; print_linecontext(); return 0; } sub kill_line { pop @record; $context[3] = " [LINE KILLED]\n"; print_linecontext(); return 0; } sub dump_record { my ($line_in, @explanation) = @_; $recmeta{explanation} = join(' ', 'Tag', $recmeta{tag}, @explanation); my $line = ; $count++; update_linecontext(); until ($line =~ m||) { push @record, $line; $line = ; $count++; update_linecontext(); } push @record, $line; write_record($EXMARC); return 1; } sub commit_edit { return 1 } sub print_context { print "\n Tag:",$recmeta{tag}, " Ind1:'", $recmeta{ind1},"' Ind2:'", $recmeta{ind2}, "'"; print_linecontext(); return 0; } sub print_linecontext { print $OUT "\n", join(' |','',@context[0..2]); print $OUT '==> |', $context[3]; print $OUT ' |', $context[4],"\n"; return 0; } sub show_original { my ($line_in) = @_; print $OUT "\n$line_in\n"; return 0; } sub help { print $OUT <>> TRASHTAGS FILE FOUND. LOADING TAGS TO BE STRIPPED FROM OUTPUT\n"; open TRASH, '<', $conf->{trash} or die "Can't open trash tags file!\n"; while () { my $lastwasrange = 0; my %lastrange = ( high => 0, low => 0); my $except = 0; my @chunks = split /\s+/; while (my $chunk = shift @chunks) { # single values if ($chunk =~ /^\d{1,3}$/) { trash_add($chunk, $except); $lastwasrange = 0; next; } # ranges if ($chunk =~ /^\d{1,3}\.\.\d{1,3}$/) { my ($low, $high) = trash_add_range($chunk, $except, \%lastrange); $lastwasrange = 1; %lastrange = (low => $low, high => $high) unless $except; next; } # 'except' if ($chunk eq 'except') { die "Keyword 'except' can only follow a range (line $.)\n" unless $lastwasrange; die "Keyword 'except' may only occur once per line (line $.)\n" if $except; $except = 1; next; } die "Unknown chunk $chunk in .trashtags file (line $.)\n"; } } # remove original id sequence tag from trash hash if we know it trash_add($conf->{'original-tag'}, 1) if ($conf->{'original-tag'} and $trash{$conf->{'original-tag'}}); } sub trash_add_range { my ($chunk, $except, $range) = @_; my ($low,$high) = split /\.\./, $chunk; die "Ranges must be 'low..high' ($low is greater than $high on line $.)\n" if ($low > $high); if ($except) { die "Exception ranges must be within last addition range (line $.)\n" if ($low < $range->{low} or $high > $range->{high}); } for my $tag ($low..$high) { trash_add($tag, $except) } return $low, $high; } sub trash_add { my ($tag, $except) = @_; die "Trash values must be valid tags (000-999)\n" unless ($tag >= 0 and $tag <= 999); if ($except) { delete $trash{$tag}; } else { die "Trash tag '$tag' specified twice (line $.)\n" if $trash{$tag}; $trash{$tag} = 1; } } #----------------------------------------------------------------------- =head2 initialize Performs boring script initialization. Handles argument parsing, mostly. =cut sub initialize { my ($c) = @_; my @missing = (); # set mode on existing filehandles binmode(STDIN, ':utf8'); my $rc = GetOptions( $c, 'autoscrub|a', 'exception|x=s', 'output|o=s', 'nocollapse|n', 'renumber-from|rf=i', 'original-tag|ot=i', 'renumber-tag|rt=i', 'renumber-subfield|rt=i', 'trash|t=s', 'help|h', ); show_help() unless $rc; show_help() if ($c->{help}); # defaults $c->{output} = 'incoming.cleaned.marc.xml' unless defined $c->{output}; $c->{exception} = 'incoming.exception.marc.xml' unless defined $c->{exception}; $c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'}; $c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'}; my @keys = keys %{$c}; show_help() unless (@ARGV and @keys); #for my $key ('runtype', 'tag', 'subfield', 'output', 'exception') # { push @missing, $key unless $c->{$key} } #if (@missing) { # print "Required option: ", join(', ', @missing), " missing!\n"; # show_help(); #} } sub show_help { print < Options --output -o Cleaned MARCXML output filename (default: incoming.cleaned.marc.xml) --exception -x Exception (dumped records) MARCXML filename (incoming.exception.marc.xml) HELP exit; }