#!/usr/bin/perl use strict; use warnings; use Getopt::Long; use Term::ReadLine; my $term = new Term::ReadLine 'yaz-cleanup'; my $OUT = $term->OUT || \*STDOUT; $| = 1; # initialization and setup my $conf = {}; initialize($conf); populate_trash() if ($conf->{trashfile}); # set up files, since everything appears to be in order open MARC, '<:utf8', (shift || 'incoming.marc.xml') or die "Can't open input file $!\n"; open my $NUMARC, '>:utf8', $conf->{output} or die "Can't open output file $!\n"; open my $OLD2NEW, '>', 'old2new.map' if ($conf->{'renumber-from'} and $conf->{'original-subfield'}); my $EXMARC = 'EX'; my @record = (); # current record storage my %recmeta = (); # metadata about current record my $ptr = 0; # record index pointer # this is the dispatch table which drives command selection in # edit(), below my %commands = ( c => \&print_fullcontext, n => \&next_line, p => \&prev_line, '<' => \&widen_window, '>' => \&narrow_window, d => \&display_lines, o => \&insert_original, k => \&kill_line, y => \&yank_line, f => \&flip_line, m => \&merge_lines, s => \&substitute, t => \&commit_edit, x => \&dump_record, q => \&quit, '?' => \&help, h => \&help, help => \&help, ); my @spinner = qw(- / | \\); my $sidx = 0; while ( buildrecord() ) { unless ($conf->{ricount} % 100) { print "\rWorking... ", $spinner[$sidx]; $sidx = ($sidx == $#spinner) ? 0 : $sidx + 1; } do_automated_cleanups(); $ptr = 0; until ($ptr == $#record) { # naked ampersands if ($record[$ptr] =~ /&/ && $record[$ptr] !~ /&\w+?;/) { edit("Naked ampersand"); $ptr= 0; next } if ($record[$ptr] =~ /{autoscrub}; next; } # test for existing 901/903 unless we're autocleaning them unless ($conf->{'strip-nines'}) { if ($match == 901 or $match == 903) { edit("Incoming 901/903 found in data"); next; } } } # subfields can't be non-alphanumeric if ($record[$ptr] =~ /\n"; print $OUT "\nDone. \n"; #----------------------------------------------------------------------------------- # cleanup routines #----------------------------------------------------------------------------------- sub do_automated_cleanups { $ptr = 0; until ($ptr == $#record) { # get datafield/tag data if we have it stow_record_data(); # catch empty datafield elements if ($record[$ptr] =~ m/|) { my @a = @record[0 .. $ptr - 1]; my @b = @record[$ptr + 2 .. $#record]; @record = (@a, @b); message("Empty datafield scrubbed"); $ptr = 0; next; } } # and quasi-empty subfields if ($record[$ptr] =~ m|(.*?)(.+?)|) { my $leader = $1; if (length $leader < 24) { $leader .= ' ' x (20 - length($leader)); $leader .= "4500"; $record[$ptr] = "$leader\n"; message("Short leader padded"); } } if ($record[$ptr] =~ m|(.+?)$content\n"; message("Short 008 padded"); } } # clean misplaced dollarsigns if ($record[$ptr] =~ m|c?\d+\.\d{2}|) { $record[$ptr] =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|; message("Dollar sign corrected"); } # clean up tags with spaces in them $record[$ptr] =~ s/tag=" /tag="00/g; $record[$ptr] =~ s/tag=" /tag="0/g; $record[$ptr] =~ s/tag="-/tag="0/g; $record[$ptr] =~ s/tag="(\d\d) /tag="0$1/g; # automatable subfield maladies $record[$ptr] =~ s/code=" ">c/code="c">/; $record[$ptr] =~ s/code=" ">\$/code="c">$/; } } sub stow_record_data { # get tag data if we're looking at it if ($record[$ptr] =~ m/{'original-subfield'} and $recmeta{tag} == $conf->{'original-tag'}) { my $line = $record[$ptr]; my $lptr = $ptr; my $osub = $conf->{'original-subfield'}; $recmeta{oid} = 'NONE'; until ($line =~ m||) { $lptr++; $line = $record[$lptr]; $recmeta{oid} = $+{TAG} if ($line =~ /(.+?){trash}{ $recmeta{tag} }; $conf->{editmsg} = $msg; print_fullcontext(); # stow original problem line $recmeta{origline} = $record[$ptr]; while (1) { my $line = $term->readline('marc-cleanup>'); my @chunks = split /\s+/, $line; # lines with single-character first chunks are commands. # make sure they exist. if (length $chunks[0] == 1) { unless (defined $commands{$chunks[0]}) { print $OUT "No such command '", $chunks[0], "'\n"; next; } } if (defined $commands{$chunks[0]}) { my $term = $commands{$chunks[0]}->(@chunks[1..$#chunks]); last if $term; } else { $recmeta{prevline} = $record[$ptr]; $record[$ptr] = "$line\n"; print_context(); } } # set pointer to top on the way out $ptr = 0; } =head2 buildrecord Constructs record arrays from the incoming MARC file and returns them to the driver loop. =cut sub buildrecord { my $l = ''; $l = while (defined $l and $l !~ //); return $l unless defined $l; @record = (); %recmeta = (); $conf->{ricount}++; until ($l =~ m||) { push @record, $l; $l = ; } push @record, $l; return 1; } sub write_record { my ($FH) = @_; my $trash = $conf->{trash}; if ($FH eq 'EX') { $EXMARC = undef; open $EXMARC, '>:utf8', $conf->{exception} or die "Can't open exception file $!\n"; $FH = $EXMARC; } $conf->{rocount}++ if ($FH eq $NUMARC); print $FH '\n" if(defined $recmeta{explanation}); # excise unwanted tags if (keys %{$trash} or $conf->{autoscrub}) { my @trimmed = (); my $istrash = 0; for my $line (@record) { if ($istrash) { $istrash = 0 if $line =~ m|{$tag} or ($conf->{autoscrub} and $tag =~ /\D/)) { $istrash = 1; next } } push @trimmed, $line; } @record = @trimmed; } # add 903(?) with new record id my $renumber = ''; if ($conf->{'renumber-from'}) { $recmeta{nid} = $conf->{'renumber-from'}; $renumber = join('', ' ', $recmeta{nid}, "\n"); my @tmp = @record[0 .. $#record - 1]; my $last = $record[$#record]; @record = (@tmp, $renumber, $last); $conf->{'renumber-from'}++; } # scrub newlines (unless told not to or writing exception record) unless ($conf->{nocollapse} or $FH eq $EXMARC) { s/\n// for (@record) } # write to old->new map file if needed if ($conf->{'renumber-from'} and $conf->{'original-subfield'}) { unless (defined $recmeta{oid}) { my $msg = join(' ', "No old id num found"); dump_record($msg); } else { print $OLD2NEW $recmeta{oid}, "\t", $recmeta{nid}, "\n" } } # actually write the record print $FH @record,"\n"; # if we were dumping to exception file, nuke the record and set ptr # to terminate processing loop @record = ('a'); $ptr = 0; } sub print_fullcontext { print $OUT "\r", ' ' x 72, "\n"; print $OUT $conf->{editmsg},"\n"; print $OUT "\r Tag:",$recmeta{tag}, " Ind1:'", $recmeta{ind1},"' Ind2:'", $recmeta{ind2}, "'"; print $OUT " @ ", $conf->{ricount}, "/", $conf->{rocount} + 1; print_context(); return 0; } sub print_context { my $upper = int($conf->{window} / 2) + 1; my $lower = int($conf->{window} / 2) - 1; my $start = ($ptr - $upper < 0) ? 0 : $ptr - $upper; my $stop = ($ptr + $lower > $#record) ? $#record : $ptr + $lower; print $OUT "\n"; print $OUT ' |', $record[$_] for ($start .. $ptr - 1); print $OUT '==> |', $record[$ptr]; print $OUT ' |', $record[$_] for ($ptr + 1 .. $stop); print $OUT "\n"; return 0; } sub message { my ($msg) = @_; print $OUT "\r$msg at ",$conf->{ricount},"/",$conf->{rocount} + 1,"\n"; } #----------------------------------------------------------------------------------- # command routines #----------------------------------------------------------------------------------- sub substitute { my (@chunks) = @_; my $ofrom = shift @chunks; if ($ofrom =~ /^'/) { until ($ofrom =~ /'$/ or !@chunks) { $ofrom .= join(' ','',shift @chunks) } $ofrom =~ s/^'//; $ofrom =~ s/'$//; } my $to = shift @chunks; if ($to =~ /^'/) { until ($to =~ /'$/ or !@chunks) { $to .= join(' ','',shift @chunks) } $to =~ s/^'//; $to =~ s/'$//; } my $from = ''; for my $char (split(//,$ofrom)) { $char = "\\" . $char if ($char =~ /\W/); $from = join('', $from, $char); } $recmeta{prevline} = $record[$ptr]; $record[$ptr] =~ s/$from/$to/; print_context(); return 0; } sub merge_lines { $recmeta{prevline} = $record[$ptr]; # remove //; # and move to front of line $record[$ptr] = join(' ', $1 , $record[$ptr]); # tear off trailing subfield tag from preceeding line $record[$ptr - 1] =~ s|\n||; # join current line onto preceeding line $record[$ptr - 1] = join('', $record[$ptr - 1], $record[$ptr]); # erase current line my @a = @record[0 .. $ptr - 1]; my @b = @record[$ptr + 1 .. $#record]; @record = (@a, @b); # move record pointer to previous line prev_line(); print_context(); return 0; } sub flip_line { unless ($recmeta{prevline}) { print $OUT "No previously edited line to flip\n"; return } my $temp = $record[$ptr]; $record[$ptr] = $recmeta{prevline}; $recmeta{prevline} = $temp; print_context(); return 0; } sub kill_line { $recmeta{killline} = $record[$ptr]; my @a = @record[0 .. $ptr - 1]; my @b = @record[$ptr + 1 .. $#record]; @record = (@a, @b); print_context(); return 0; } sub yank_line { unless ($recmeta{killline}) { print $OUT "No killed line to yank\n"; return } my @a = @record[0 .. $ptr - 1]; my @b = @record[$ptr .. $#record]; @record = (@a, $conf->{killline}, @b); print_context(); return 0; } sub insert_original { $record[$ptr] = $recmeta{origline}; print_context(); return 0; } sub display_lines { print $OUT "\nOrig. edit line :", $recmeta{origline}; print $OUT "Current flip line:", $recmeta{prevline} if $recmeta{prevline}; print $OUT "Last killed line :", $recmeta{killline} if $recmeta{killline}; print $OUT "\n"; return 0; } sub dump_record { my (@explanation) = @_; print $OUT @explanation; $recmeta{explanation} = join(' ', 'Tag', $recmeta{tag}, @explanation); write_record($EXMARC); return 1; } sub next_line { $ptr++ unless ($ptr == $#record);; print_context(); return 0; } sub prev_line { $ptr-- unless ($ptr == 0); print_context(); return 0; } sub commit_edit { return 1 } sub widen_window { if ($conf->{window} == 15) { print $OUT "Window can't be bigger than 15 lines\n"; return } $conf->{window} += 2; print_context; } sub narrow_window { if ($conf->{window} == 5) { print $OUT "Window can't be smaller than 5 lines\n"; return } $conf->{window} -= 2; print_context; } sub help { print $OUT < Contract context window | y Yank last killed line p Move pointer to prev line | m Merge current line into preceding line n Move pointer to next line | o Insert original line c Print line context | f Flip current line and last edited line d Print current saved lines | -----------------------------+------------------------------------------- s Subtitute; replace ARG1 in current line with ARG2. If either ARG contains spaces, it must be single-quoted t Commit changes and resume automated operations x Dump record to exception file q Quit HELP return 0; } sub quit { exit } #----------------------------------------------------------------------------------- # populate_trash #----------------------------------------------------------------------------------- # defined a domain-specific language for specifying MARC tags to be dropped from # records during processing. it is line oriented, and is specified as follows: # # each line may specify any number of tags to be included, either singly (\d{1,3}) # or as a range (\d{1,3}\.\.\d{1,3} # # if a single number is given, it must be between '000' and '999', inclusive. # # ranges obey the previous rule, and also the first number of the range must be less # than the second number # # finally, any single range in a line may be followed by the keyword 'except'. every # number or range after 'except' is excluded from the range specified. all these # numbers must actually be within the range. # # specifying a tag twice is an error, to help prevent typos sub populate_trash { print $OUT ">>> TRASHTAGS FILE FOUND. LOADING TAGS TO BE STRIPPED FROM OUTPUT\n"; open TRASH, '<', $conf->{trashfile} or die "Can't open trash tags file!\n"; while () { my $lastwasrange = 0; my %lastrange = ( high => 0, low => 0); my $except = 0; my @chunks = split /\s+/; while (my $chunk = shift @chunks) { # single values if ($chunk =~ /^\d{1,3}$/) { trash_add($chunk, $except); $lastwasrange = 0; next; } # ranges if ($chunk =~ /^\d{1,3}\.\.\d{1,3}$/) { my ($low, $high) = trash_add_range($chunk, $except, \%lastrange); $lastwasrange = 1; %lastrange = (low => $low, high => $high) unless $except; next; } # 'except' if ($chunk eq 'except') { die "Keyword 'except' can only follow a range (line $.)\n" unless $lastwasrange; die "Keyword 'except' may only occur once per line (line $.)\n" if $except; $except = 1; next; } die "Unknown chunk $chunk in .trashtags file (line $.)\n"; } } # remove original id sequence tag from trash hash if we know it trash_add($conf->{'original-tag'}, 1) if ($conf->{'original-tag'} and $conf->{trash}{ $conf->{'original-tag'} }); } sub trash_add_range { my ($chunk, $except, $range) = @_; my ($low,$high) = split /\.\./, $chunk; die "Ranges must be 'low..high' ($low is greater than $high on line $.)\n" if ($low > $high); if ($except) { die "Exception ranges must be within last addition range (line $.)\n" if ($low < $range->{low} or $high > $range->{high}); } for my $tag ($low..$high) { trash_add($tag, $except) } return $low, $high; } sub trash_add { my ($tag, $except) = @_; my $trash = $conf->{trash}; die "Trash values must be valid tags (000-999)\n" unless ($tag >= 0 and $tag <= 999); if ($except) { delete $trash->{$tag}; } else { die "Trash tag '$tag' specified twice (line $.)\n" if $trash->{$tag}; $trash->{$tag} = 1; } } #----------------------------------------------------------------------- =head2 initialize Performs boring script initialization. Handles argument parsing, mostly. =cut sub initialize { my ($c) = @_; my @missing = (); # set mode on existing filehandles binmode(STDIN, ':utf8'); my $rc = GetOptions( $c, 'autoscrub|a', 'exception|x=s', 'output|o=s', 'prefix|p=s', 'nocollapse|n', 'renumber-from|rf=i', 'renumber-tag|rt=i', 'renumber-subfield|rs=s', 'original-tag|ot=i', 'original-subfield|os=s', 'script', 'strip-nines', 'trashfile|t=s', 'trashhelp', 'help|h', ); show_help() unless $rc; show_help() if ($c->{help}); show_trashhelp() if ($c->{trashhelp}); # defaults if ($c->{prefix}) { $c->{output} = join('.',$c->{prefix},'clean','marc','xml'); $c->{exception} = join('.',$c->{prefix},'marc','ex'); } $c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'}; $c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'}; $c->{window} = 5; # autotrash 901, 903 if strip-nines if ($c->{'strip-nines'}) { $c->{trash}{901} = 1; $c->{trash}{903} = 1; } my @keys = keys %{$c}; show_help() unless (@ARGV and @keys); } sub show_help { print < Options --output -o Cleaned MARCXML output filename --exception -x Exception (dumped records) MARCXML filename or --prefix=> -p Shared prefix for output/exception files. Will produce PREFIX.clean.marc.xml and PREFIX.ex.xml --renumber-from -rf Begin renumbering id sequence with this number --renumber-tag -rt Tag to use in renumbering (default: 903) --renumber-subfield -rs Subfield code to use in renumbering (default: a) --original-tag -ot Original id tag; will be kept in output even if it appears in the trash file --original-subfield -os Original id subfield code. If this is specified and renumbering is in effect, an old-to-new mapping file (old2new.map) will be generated. --autoscrub -a Automatically remove non-numeric tags in data --nocollapse -n Don't compress records to one line on output --strip-nines Automatically remove any existing 901/903 tags in data --trashfile -t File containing trash tag data (see --trashhelp) --script Store human-initiated ops in scriptfile (.mcscript) Not yet implemented HELP exit; } sub show_trashhelp { print <