#!/usr/bin/perl use strict; use warnings; use Getopt::Long; use Term::ReadLine; $| = 1; my $term = new Term::ReadLine 'yaz-cleanup'; my $OUT = $term->OUT || \*STDOUT; # initialization and setup my $conf = {}; initialize($conf); populate_trash() if ($conf->{trash}); my @record = (); # current record storage my %recmeta = (); # metadata about current record my $recptr = 0; # record index pointer my $input = shift || 'incoming.marc.xml'; open MARC, '<:utf8', $input; open my $NUMARC, '>:utf8', $conf->{output}; print $NUMARC '',"\n"; print $NUMARC '',"\n"; open my $EXMARC, '>:utf8', $conf->{exception}; # this is the dispatch table which drives command selection in # edit(), below my %commands = ( c => \&print_context, C => \&print_linecontext, o => \&show_original, f => \&flip_lines, k => \&kill_line, m => \&merge_lines, n => \&next_line, p => \&prev_line, s => \&substitute, t => \&commit_edit, x => \&dump_record, q => \&quit, '?' => \&help, h => \&help, help => \&help, ); my @spinner = qw(- / | \\); my $sidx = 0; while ( buildrecord() ) { unless ($conf->{ricount} % 100) { print "\rWorking... ", $spinner[$sidx]; $sidx = ($sidx == $#spinner) ? 0 : $sidx + 1; } do_automated_cleanups(); $recptr = 0; until ($recptr == $#record) { # naked ampersands if ($record[$recptr] =~ /&/ && $record[$recptr] !~ /&\w+?;/) { edit("Naked ampersand"); $recptr= 0; next } # tags must be numeric if ($record[$recptr] =~ /\n"; print $OUT "\nDone. \n"; #----------------------------------------------------------------------------------- # cleanup routines #----------------------------------------------------------------------------------- sub do_automated_cleanups { $recptr = 0; until ($recptr == $#record) { # catch empty datafield elements if ($record[$recptr] =~ m//) { if ($record[$recptr + 1] =~ m||) { my @a = @record[0 .. $recptr - 1]; my @b = @record[$recptr + 2 .. $#record]; @record = (@a, @b); message("Empty datafield scrubbed"); $recptr = 0; next; } } # and quasi-empty subfields if ($record[$recptr] =~ m|(.*?)(.+?)|) { my $leader = $1; if (length $leader < 24) { $leader .= ' ' x (20 - length($leader)); $leader .= "4500"; $record[$recptr] = "$leader\n"; message("Short leader padded"); } } if ($record[$recptr] =~ m|(.+?)$content\n"; message("Short 008 padded"); } } # clean misplaced dollarsigns if ($record[$recptr] =~ m|c?\d+\.\d{2}|) { $record[$recptr] =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|; message("Dollar sign corrected"); } # clean up tags with spaces in them $record[$recptr] =~ s/tag=" /tag="00/g; $record[$recptr] =~ s/tag=" /tag="0/g; $record[$recptr] =~ s/tag="-/tag="0/g; $record[$recptr] =~ s/tag="(\d\d) /tag="0$1/g; # stow tag data if we're looking at it if ($record[$recptr] =~ m//) { $recmeta{tag} = $1; $recmeta{ind1} = $2; $recmeta{ind2} = $3; } # automatable subfield maladies $record[$recptr] =~ s/code=" ">c/code="c">/; $record[$recptr] =~ s/code=" ">\$/code="c"$>/; } } #----------------------------------------------------------------------------------- # driver routines #----------------------------------------------------------------------------------- =head2 edit Handles the Term::ReadLine loop =cut sub edit { my ($msg) = @_; return if $conf->{trash}{ $recmeta{tag} }; message($msg, 1); print_context(); # stow original problem line $conf->{origline} = $record[$recptr]; while (1) { my $line = $term->readline('marc-cleanup>'); my @chunks = split /\s+/, $line; if (length $chunks[0] == 1) { next unless (defined $commands{$chunks[0]}) } if (defined $commands{$chunks[0]}) { my $term = $commands{$chunks[0]}->(@chunks[1..$#chunks]); last if $term; } else { $record[$recptr] = "$line\n"; print_linecontext(); } } } =head2 buildrecord Constructs record arrays from the incoming MARC file and returns them to the driver loop. =cut sub buildrecord { my $l = ''; $l = while (defined $l and $l !~ //); return $l unless defined $l; @record = ($l); %recmeta = (); $conf->{ricount}++; until ($l =~ m||) { push @record, $l; $l = ; } push @record, $l; return 1; } sub write_record { my ($FH) = @_; my $trash = $conf->{trash}; $conf->{rocount}++ if ($FH eq $NUMARC); print $FH '\n" if(defined $recmeta{explanation}); # excise unwanted tags if (keys %{$trash} or $conf->{autoscrub}) { my @trimmed = (); my $istrash = 0; for my $line (@record) { if ($istrash) { $istrash = 0 if $line =~ m|{$tag} or ($conf->{autoscrub} and $tag =~ /\D/)) { $istrash = 1; next } } push @trimmed, $line; } @record = @trimmed; } # scrub newlines unless ($conf->{nocollapse}) { s/\n// for (@record); } # add 903(?) with new record id my $renumber = ''; if ($conf->{'renumber-from'}) { $renumber = join('', '', '', $conf->{'renumber-from'}, ''); $renumber .= "\n" if $conf->{nocollapse}; push @record, $renumber; $conf->{'renumber-from'}++; } print $FH @record; print $FH "\n"; } sub print_context { print " Tag:",$recmeta{tag}, " Ind1:'", $recmeta{ind1},"' Ind2:'", $recmeta{ind2}, "'\n"; print_linecontext(); return 0; } sub print_linecontext { my $low = ($recptr - 3 < 0) ? 0 : $recptr - 3; print $OUT ' |', $record[$_] for ($low .. $recptr - 1); print $OUT '==> |', $record[$recptr]; print $OUT ' |', $record[$recptr + 1], "\n"; return 0; } sub message { my ($msg, $new) = @_; print $OUT "\r", ' ' x 72, "\n" if $new; print $OUT "\r$msg at ",$conf->{ricount},"/",$conf->{rocount} + 1,"\n"; } #----------------------------------------------------------------------------------- # command routines #----------------------------------------------------------------------------------- sub substitute { my ($line_in, @chunks) = @_; my $ofrom = shift @chunks; if ($ofrom =~ /^'/ or !@chunks) { until ($ofrom =~ /'$/) { $ofrom .= join(' ','',shift @chunks) } $ofrom =~ s/^'//; $ofrom =~ s/'$//; } my $to = shift @chunks; if ($to =~ /^'/) { until ($to =~ /'$/ or !@chunks) { $to .= join(' ','',shift @chunks) } $to =~ s/^'//; $to =~ s/'$//; } my $from = ''; for my $char (split(//,$ofrom)) { $char = "\\" . $char if ($char =~ /\W/); $from = join('', $from, $char); } $record[$recptr] =~ s/$from/$to/; print_linecontext(); return 0; } sub merge_lines { # remove //; # and move to front of line $record[$recptr] = join(' ', $1 , $record[$recptr]); # tear off trailing subfield tag from preceeding line $record[$recptr - 1] =~ s|\n||; # join current line onto preceeding line $record[$recptr - 1] = join('', $record[$recptr - 1], $record[$recptr]); # erase current line my @a = @record[0 .. $recptr - 1]; my @b = @record[$recptr + 1 .. $#record]; @record = (@a, @b); # move record pointer to previous line prev_line(); print_linecontext(); return 0; } sub kill_line { my @a = @record[0 .. $recptr - 1]; my @b = @record[$recptr + 1 .. $#record]; @record = (@a, @b); print_linecontext(); return 0; } sub dump_record { my ($line_in, @explanation) = @_; $recmeta{explanation} = join(' ', 'Tag', $recmeta{tag}, @explanation); write_record($EXMARC); return 1; } sub next_line { $recptr++; print_linecontext(); return 0; } sub prev_line { $recptr--; print_linecontext(); return 0; } sub show_original { my ($line_in) = @_; print $OUT "\n", $conf->{origline}, "\n"; return 0; } sub commit_edit { return 1 } sub help { print $OUT <>> TRASHTAGS FILE FOUND. LOADING TAGS TO BE STRIPPED FROM OUTPUT\n"; open TRASH, '<', $conf->{trash} or die "Can't open trash tags file!\n"; while () { my $lastwasrange = 0; my %lastrange = ( high => 0, low => 0); my $except = 0; my @chunks = split /\s+/; while (my $chunk = shift @chunks) { # single values if ($chunk =~ /^\d{1,3}$/) { trash_add($chunk, $except); $lastwasrange = 0; next; } # ranges if ($chunk =~ /^\d{1,3}\.\.\d{1,3}$/) { my ($low, $high) = trash_add_range($chunk, $except, \%lastrange); $lastwasrange = 1; %lastrange = (low => $low, high => $high) unless $except; next; } # 'except' if ($chunk eq 'except') { die "Keyword 'except' can only follow a range (line $.)\n" unless $lastwasrange; die "Keyword 'except' may only occur once per line (line $.)\n" if $except; $except = 1; next; } die "Unknown chunk $chunk in .trashtags file (line $.)\n"; } } # remove original id sequence tag from trash hash if we know it trash_add($conf->{'original-tag'}, 1) if ($conf->{'original-tag'} and $conf->{trash}{ $conf->{'original-tag'} }); } sub trash_add_range { my ($chunk, $except, $range) = @_; my ($low,$high) = split /\.\./, $chunk; die "Ranges must be 'low..high' ($low is greater than $high on line $.)\n" if ($low > $high); if ($except) { die "Exception ranges must be within last addition range (line $.)\n" if ($low < $range->{low} or $high > $range->{high}); } for my $tag ($low..$high) { trash_add($tag, $except) } return $low, $high; } sub trash_add { my ($tag, $except) = @_; my $trash = $conf->{trash}; die "Trash values must be valid tags (000-999)\n" unless ($tag >= 0 and $tag <= 999); if ($except) { delete $trash->{$tag}; } else { die "Trash tag '$tag' specified twice (line $.)\n" if $trash->{$tag}; $trash->{$tag} = 1; } } #----------------------------------------------------------------------- =head2 initialize Performs boring script initialization. Handles argument parsing, mostly. =cut sub initialize { my ($c) = @_; my @missing = (); # set mode on existing filehandles binmode(STDIN, ':utf8'); my $rc = GetOptions( $c, 'autoscrub|a', 'exception|x=s', 'output|o=s', 'nocollapse|n', 'renumber-from|rf=i', 'original-tag|ot=i', 'renumber-tag|rt=i', 'renumber-subfield|rt=i', 'trash|t=s', 'help|h', ); show_help() unless $rc; show_help() if ($c->{help}); # defaults $c->{output} = 'incoming.cleaned.marc.xml' unless defined $c->{output}; $c->{exception} = 'incoming.exception.marc.xml' unless defined $c->{exception}; $c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'}; $c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'}; my @keys = keys %{$c}; show_help() unless (@ARGV and @keys); #for my $key ('runtype', 'tag', 'subfield', 'output', 'exception') # { push @missing, $key unless $c->{$key} } #if (@missing) { # print "Required option: ", join(', ', @missing), " missing!\n"; # show_help(); #} } sub show_help { print < Options --output -o Cleaned MARCXML output filename (default: incoming.cleaned.marc.xml) --exception -x Exception (dumped records) MARCXML filename (incoming.exception.marc.xml) HELP exit; }