From 9edd65e09a65fb042313081490a79f7e5ef33406 Mon Sep 17 00:00:00 2001 From: Shawn Boyette Date: Fri, 19 Dec 2008 17:55:27 +0000 Subject: [PATCH 1/1] been meaning to rename that for a while --- marc-cleanup | 751 ---------------------------------------------------------- marc_cleanup | 751 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 751 insertions(+), 751 deletions(-) delete mode 100755 marc-cleanup create mode 100755 marc_cleanup diff --git a/marc-cleanup b/marc-cleanup deleted file mode 100755 index f40babb..0000000 --- a/marc-cleanup +++ /dev/null @@ -1,751 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; - -use Getopt::Long; -use Term::ReadLine; - -my $term = new Term::ReadLine 'yaz-cleanup'; -my $OUT = $term->OUT || \*STDOUT; - -$| = 1; - -# initialization and setup -my $conf = {}; -initialize($conf); -populate_trash() if ($conf->{trashfile}); - -# set up files, since everything appears to be in order -open MARC, '<:utf8', (shift || 'incoming.marc.xml') - or die "Can't open input file $!\n"; -open my $NUMARC, '>:utf8', $conf->{output} - or die "Can't open output file $!\n"; -open my $OLD2NEW, '>', 'old2new.map' - if ($conf->{'renumber-from'} and $conf->{'original-subfield'}); -my $EXMARC = 'EX'; - - -my @record = (); # current record storage -my %recmeta = (); # metadata about current record -my $ptr = 0; # record index pointer - -# this is the dispatch table which drives command selection in -# edit(), below -my %commands = ( c => \&print_fullcontext, - n => \&next_line, - p => \&prev_line, - '<' => \&widen_window, - '>' => \&narrow_window, - d => \&display_lines, - o => \&insert_original, - k => \&kill_line, - y => \&yank_line, - f => \&flip_line, - m => \&merge_lines, - s => \&substitute, - t => \&commit_edit, - x => \&dump_record, - q => \&quit, - '?' => \&help, - h => \&help, - help => \&help, - ); - -my @spinner = qw(- / | \\); -my $sidx = 0; - -while ( buildrecord() ) { - unless ($conf->{ricount} % 100) { - print "\rWorking... ", $spinner[$sidx]; - $sidx = ($sidx == $#spinner) ? 0 : $sidx + 1; - } - - do_automated_cleanups(); - - $ptr = 0; - until ($ptr == $#record) { - # naked ampersands - if ($record[$ptr] =~ /&/ && $record[$ptr] !~ /&\w+?;/) - { edit("Naked ampersand"); $ptr= 0; next } - - if ($record[$ptr] =~ /{autoscrub}; - next; - } - # test for existing 901/903 unless we're autocleaning them - unless ($conf->{'strip-nines'}) { - if ($match == 901 or $match == 903) { - edit("Incoming 901/903 found in data"); - next; - } - } - } - - # subfields can't be non-alphanumeric - if ($record[$ptr] =~ /\n"; -print $OUT "\nDone. \n"; - - -#----------------------------------------------------------------------------------- -# cleanup routines -#----------------------------------------------------------------------------------- - -sub do_automated_cleanups { - $ptr = 0; - until ($ptr == $#record) { - # get datafield/tag data if we have it - stow_record_data(); - - # catch empty datafield elements - if ($record[$ptr] =~ m/|) { - my @a = @record[0 .. $ptr - 1]; - my @b = @record[$ptr + 2 .. $#record]; - @record = (@a, @b); - message("Empty datafield scrubbed"); - $ptr = 0; - next; - } - } - # and quasi-empty subfields - if ($record[$ptr] =~ m|(.*?)(.+?)|) { - my $leader = $1; - if (length $leader < 24) { - $leader .= ' ' x (20 - length($leader)); - $leader .= "4500"; - $record[$ptr] = "$leader\n"; - message("Short leader padded"); - } - } - if ($record[$ptr] =~ m|(.+?)$content\n"; - message("Short 008 padded"); - } - } - - # clean misplaced dollarsigns - if ($record[$ptr] =~ m|c?\d+\.\d{2}|) { - $record[$ptr] =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|; - message("Dollar sign corrected"); - } - - # clean up tags with spaces in them - $record[$ptr] =~ s/tag=" /tag="00/g; - $record[$ptr] =~ s/tag=" /tag="0/g; - $record[$ptr] =~ s/tag="-/tag="0/g; - $record[$ptr] =~ s/tag="(\d\d) /tag="0$1/g; - - # automatable subfield maladies - $record[$ptr] =~ s/code=" ">c/code="c">/; - $record[$ptr] =~ s/code=" ">\$/code="c"$>/; - } -} - -sub stow_record_data { - # get tag data if we're looking at it - - if ($record[$ptr] =~ m/{'original-subfield'} and - $recmeta{tag} == $conf->{'original-tag'}) { - my $line = $record[$ptr]; my $lptr = $ptr; - my $osub = $conf->{'original-subfield'}; - $recmeta{oid} = 'NONE'; - - until ($line =~ m||) { - $lptr++; - $line = $record[$lptr]; - $recmeta{oid} = $+{TAG} - if ($line =~ /(.+?){trash}{ $recmeta{tag} }; - $conf->{editmsg} = $msg; - print_fullcontext(); - - # stow original problem line - $recmeta{origline} = $record[$ptr]; - - while (1) { - my $line = $term->readline('marc-cleanup>'); - my @chunks = split /\s+/, $line; - - # lines with single-character first chunks are commands. - # make sure they exist. - if (length $chunks[0] == 1) { - unless (defined $commands{$chunks[0]}) { - print $OUT "No such command '", $chunks[0], "'\n"; - next; - } - } - - if (defined $commands{$chunks[0]}) { - my $term = $commands{$chunks[0]}->(@chunks[1..$#chunks]); - last if $term; - } else { - $recmeta{prevline} = $record[$ptr]; - $record[$ptr] = "$line\n"; - print_context(); - } - } - # set pointer to top on the way out - $ptr = 0; -} - -=head2 buildrecord - -Constructs record arrays from the incoming MARC file and returns them -to the driver loop. - -=cut - -sub buildrecord { - my $l = ''; - $l = while (defined $l and $l !~ //); - return $l unless defined $l; - @record = (); - %recmeta = (); - $conf->{ricount}++; - - until ($l =~ m||) - { push @record, $l; $l = ; } - push @record, $l; - return 1; -} - -sub write_record { - my ($FH) = @_; - my $trash = $conf->{trash}; - - if ($FH eq 'EX') { - $EXMARC = undef; - open $EXMARC, '>:utf8', $conf->{exception} - or die "Can't open exception file $!\n"; - $FH = $EXMARC; - } - - $conf->{rocount}++ if ($FH eq $NUMARC); - print $FH '\n" - if(defined $recmeta{explanation}); - - # excise unwanted tags - if (keys %{$trash} or $conf->{autoscrub}) { - my @trimmed = (); - my $istrash = 0; - for my $line (@record) { - if ($istrash) { - $istrash = 0 if $line =~ m|{$tag} or ($conf->{autoscrub} and $tag =~ /\D/)) { - $istrash = 1; - next - } - } - push @trimmed, $line; - } - @record = @trimmed; - } - - # add 903(?) with new record id - my $renumber = ''; - if ($conf->{'renumber-from'}) { - $recmeta{nid} = $conf->{'renumber-from'}; - $renumber = join('', ' ', $recmeta{nid}, "\n"); - my @tmp = @record[0 .. $#record - 1]; - my $last = $record[$#record]; - @record = (@tmp, $renumber, $last); - $conf->{'renumber-from'}++; - } - - # scrub newlines (unless told not to or writing exception record) - unless ($conf->{nocollapse} or $FH eq $EXMARC) - { s/\n// for (@record) } - - # write to old->new map file if needed - if ($conf->{'renumber-from'} and $conf->{'original-subfield'}) { - unless (defined $recmeta{oid}) { - my $msg = join(' ', "No old id num found"); - dump_record($msg); - } else { - print $OLD2NEW $recmeta{oid}, "\t", $recmeta{nid}, "\n" - } - } - - # actually write the record - print $FH @record,"\n"; - - # if we were dumping to exception file, nuke the record and set ptr - # to terminate processing loop - @record = ('a'); - $ptr = 0; -} - -sub print_fullcontext { - print $OUT "\r", ' ' x 72, "\n"; - print $OUT $conf->{editmsg},"\n"; - print $OUT "\r Tag:",$recmeta{tag}, " Ind1:'", - $recmeta{ind1},"' Ind2:'", $recmeta{ind2}, "'"; - print $OUT " @ ", $conf->{ricount}, "/", $conf->{rocount} + 1; - print_context(); - return 0; -} - -sub print_context { - my $upper = int($conf->{window} / 2) + 1; - my $lower = int($conf->{window} / 2) - 1; - my $start = ($ptr - $upper < 0) ? 0 : $ptr - $upper; - my $stop = ($ptr + $lower > $#record) ? $#record : $ptr + $lower; - print $OUT "\n"; - print $OUT ' |', $record[$_] for ($start .. $ptr - 1); - print $OUT '==> |', $record[$ptr]; - print $OUT ' |', $record[$_] for ($ptr + 1 .. $stop); - print $OUT "\n"; - return 0; -} - -sub message { - my ($msg) = @_; - print $OUT "\r$msg at ",$conf->{ricount},"/",$conf->{rocount} + 1,"\n"; -} - -#----------------------------------------------------------------------------------- -# command routines -#----------------------------------------------------------------------------------- - -sub substitute { - my (@chunks) = @_; - - my $ofrom = shift @chunks; - if ($ofrom =~ /^'/) { - until ($ofrom =~ /'$/ or !@chunks) - { $ofrom .= join(' ','',shift @chunks) } - $ofrom =~ s/^'//; $ofrom =~ s/'$//; - } - my $to = shift @chunks; - if ($to =~ /^'/) { - until ($to =~ /'$/ or !@chunks) - { $to .= join(' ','',shift @chunks) } - $to =~ s/^'//; $to =~ s/'$//; - } - - my $from = ''; - for my $char (split(//,$ofrom)) { - $char = "\\" . $char if ($char =~ /\W/); - $from = join('', $from, $char); - } - - $recmeta{prevline} = $record[$ptr]; - $record[$ptr] =~ s/$from/$to/; - print_context(); - return 0; -} - -sub merge_lines { - $recmeta{prevline} = $record[$ptr]; - # remove //; - # and move to front of line - $record[$ptr] = join(' ', $1 , $record[$ptr]); - # tear off trailing subfield tag from preceeding line - $record[$ptr - 1] =~ s|\n||; - # join current line onto preceeding line - $record[$ptr - 1] = join('', $record[$ptr - 1], $record[$ptr]); - # erase current line - my @a = @record[0 .. $ptr - 1]; - my @b = @record[$ptr + 1 .. $#record]; - @record = (@a, @b); - # move record pointer to previous line - prev_line(); - print_context(); - return 0; -} - -sub flip_line { - unless ($recmeta{prevline}) - { print $OUT "No previously edited line to flip\n"; return } - my $temp = $record[$ptr]; - $record[$ptr] = $recmeta{prevline}; - $recmeta{prevline} = $temp; - print_context(); - return 0; -} - -sub kill_line { - $recmeta{killline} = $record[$ptr]; - my @a = @record[0 .. $ptr - 1]; - my @b = @record[$ptr + 1 .. $#record]; - @record = (@a, @b); - print_context(); - return 0; -} - -sub yank_line { - unless ($recmeta{killline}) - { print $OUT "No killed line to yank\n"; return } - my @a = @record[0 .. $ptr - 1]; - my @b = @record[$ptr .. $#record]; - @record = (@a, $conf->{killline}, @b); - print_context(); - return 0; -} - -sub insert_original { - $record[$ptr] = $recmeta{origline}; - print_context(); - return 0; -} - -sub display_lines { - print $OUT "\nOrig. edit line :", $recmeta{origline}; - print $OUT "Current flip line:", $recmeta{prevline} if $recmeta{prevline}; - print $OUT "Last killed line :", $recmeta{killline} if $recmeta{killline}; - print $OUT "\n"; - return 0; -} - -sub dump_record { - my (@explanation) = @_; - print $OUT @explanation; - $recmeta{explanation} = join(' ', 'Tag', $recmeta{tag}, @explanation); - write_record($EXMARC); - return 1; -} - -sub next_line { - $ptr++ unless ($ptr == $#record);; - print_context(); - return 0; -} - -sub prev_line { - $ptr-- unless ($ptr == 0); - print_context(); - return 0; -} - -sub commit_edit { return 1 } - -sub widen_window { - if ($conf->{window} == 15) - { print $OUT "Window can't be bigger than 15 lines\n"; return } - $conf->{window} += 2; - print_context; -} - -sub narrow_window { - if ($conf->{window} == 5) - { print $OUT "Window can't be smaller than 5 lines\n"; return } - $conf->{window} -= 2; - print_context; -} - -sub help { -print $OUT < Contract context window | y Yank last killed line -p Move pointer to prev line | m Merge current line into preceding line -n Move pointer to next line | o Insert original line -c Print line context | f Flip current line and last edited line -d Print current saved lines | ------------------------------+------------------------------------------- -s Subtitute; replace ARG1 in current line with ARG2. If either ARG - contains spaces, it must be single-quoted -t Commit changes and resume automated operations -x Dump record to exception file -q Quit - -HELP -return 0; -} - -sub quit { exit } - -#----------------------------------------------------------------------------------- -# populate_trash -#----------------------------------------------------------------------------------- -# defined a domain-specific language for specifying MARC tags to be dropped from -# records during processing. it is line oriented, and is specified as follows: -# -# each line may specify any number of tags to be included, either singly (\d{1,3}) -# or as a range (\d{1,3}\.\.\d{1,3} -# -# if a single number is given, it must be between '000' and '999', inclusive. -# -# ranges obey the previous rule, and also the first number of the range must be less -# than the second number -# -# finally, any single range in a line may be followed by the keyword 'except'. every -# number or range after 'except' is excluded from the range specified. all these -# numbers must actually be within the range. -# -# specifying a tag twice is an error, to help prevent typos - -sub populate_trash { - print $OUT ">>> TRASHTAGS FILE FOUND. LOADING TAGS TO BE STRIPPED FROM OUTPUT\n"; - open TRASH, '<', $conf->{trashfile} - or die "Can't open trash tags file!\n"; - while () { - my $lastwasrange = 0; - my %lastrange = ( high => 0, low => 0); - my $except = 0; - - my @chunks = split /\s+/; - while (my $chunk = shift @chunks) { - - # single values - if ($chunk =~ /^\d{1,3}$/) { - trash_add($chunk, $except); - $lastwasrange = 0; - next; - } - - # ranges - if ($chunk =~ /^\d{1,3}\.\.\d{1,3}$/) { - my ($low, $high) = trash_add_range($chunk, $except, \%lastrange); - $lastwasrange = 1; - %lastrange = (low => $low, high => $high) - unless $except; - next; - } - - # 'except' - if ($chunk eq 'except') { - die "Keyword 'except' can only follow a range (line $.)\n" - unless $lastwasrange; - die "Keyword 'except' may only occur once per line (line $.)\n" - if $except; - $except = 1; - next; - } - - die "Unknown chunk $chunk in .trashtags file (line $.)\n"; - } - } - - # remove original id sequence tag from trash hash if we know it - trash_add($conf->{'original-tag'}, 1) - if ($conf->{'original-tag'} and $conf->{trash}{ $conf->{'original-tag'} }); -} - -sub trash_add_range { - my ($chunk, $except, $range) = @_; - my ($low,$high) = split /\.\./, $chunk; - die "Ranges must be 'low..high' ($low is greater than $high on line $.)\n" - if ($low > $high); - if ($except) { - die "Exception ranges must be within last addition range (line $.)\n" - if ($low < $range->{low} or $high > $range->{high}); - } - for my $tag ($low..$high) { - trash_add($tag, $except) - } - return $low, $high; -} - -sub trash_add { - my ($tag, $except) = @_; - my $trash = $conf->{trash}; - - die "Trash values must be valid tags (000-999)\n" - unless ($tag >= 0 and $tag <= 999); - - if ($except) { - delete $trash->{$tag}; - } else { - die "Trash tag '$tag' specified twice (line $.)\n" - if $trash->{$tag}; - $trash->{$tag} = 1; - } -} - -#----------------------------------------------------------------------- - -=head2 initialize - -Performs boring script initialization. Handles argument parsing, -mostly. - -=cut - -sub initialize { - my ($c) = @_; - my @missing = (); - - # set mode on existing filehandles - binmode(STDIN, ':utf8'); - - my $rc = GetOptions( $c, - 'autoscrub|a', - 'exception|x=s', - 'output|o=s', - 'prefix|p=s', - 'nocollapse|n', - 'renumber-from|rf=i', - 'renumber-tag|rt=i', - 'renumber-subfield|rs=s', - 'original-tag|ot=i', - 'original-subfield|os=s', - 'script', - 'strip-nines', - 'trashfile|t=s', - 'trashhelp', - 'help|h', - ); - show_help() unless $rc; - show_help() if ($c->{help}); - show_trashhelp() if ($c->{trashhelp}); - - # defaults - if ($c->{prefix}) { - $c->{output} = join('.',$c->{prefix},'marc','xml'); - $c->{exception} = join('.',$c->{prefix},'marc','ex'); - } - $c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'}; - $c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'}; - $c->{window} = 5; - - # autotrash 901, 903 if strip-nines - if ($c->{'strip-nines'}) { - $c->{trash}{901} = 1; - $c->{trash}{903} = 1; - } - - my @keys = keys %{$c}; - show_help() unless (@ARGV and @keys); -} - -sub show_help { - print < -Options - --output -o Cleaned MARCXML output filename - --exception -x Exception (dumped records) MARCXML filename - or - --prefix=> -p Shared prefix for output/exception files. Will - produce PREFIX.marc.xml and PREFIX.ex.xml - - --trashfile -t File containing trash tag data (see --trashhelp) - - --renumber-from -rf Begin renumbering id sequence with this number - --renumber-tag -rt Tag to use in renumbering (default: 903) - --renumber-subfield -rs Subfield code to use in renumbering (default: a) - --original-tag -ot Original id tag; will be kept in output even if - it appears in the trash file - --original-subfield -os Original id subfield code. If this is specified - and renumbering is in effect, an old-to-new mapping - file (old2new.map) will be generated. - - --nocollapse -n Don't compress records to one line on output - --autoscrub -a Automatically remove non-numeric tags in data - --strip-nines Automatically remove any existing 901/903 tags in data - - --script Store human-initiated ops in scriptfile (.mcscript) - Not yet implemented -HELP -exit; -} - -sub show_trashhelp { - print <OUT || \*STDOUT; + +$| = 1; + +# initialization and setup +my $conf = {}; +initialize($conf); +populate_trash() if ($conf->{trashfile}); + +# set up files, since everything appears to be in order +open MARC, '<:utf8', (shift || 'incoming.marc.xml') + or die "Can't open input file $!\n"; +open my $NUMARC, '>:utf8', $conf->{output} + or die "Can't open output file $!\n"; +open my $OLD2NEW, '>', 'old2new.map' + if ($conf->{'renumber-from'} and $conf->{'original-subfield'}); +my $EXMARC = 'EX'; + + +my @record = (); # current record storage +my %recmeta = (); # metadata about current record +my $ptr = 0; # record index pointer + +# this is the dispatch table which drives command selection in +# edit(), below +my %commands = ( c => \&print_fullcontext, + n => \&next_line, + p => \&prev_line, + '<' => \&widen_window, + '>' => \&narrow_window, + d => \&display_lines, + o => \&insert_original, + k => \&kill_line, + y => \&yank_line, + f => \&flip_line, + m => \&merge_lines, + s => \&substitute, + t => \&commit_edit, + x => \&dump_record, + q => \&quit, + '?' => \&help, + h => \&help, + help => \&help, + ); + +my @spinner = qw(- / | \\); +my $sidx = 0; + +while ( buildrecord() ) { + unless ($conf->{ricount} % 100) { + print "\rWorking... ", $spinner[$sidx]; + $sidx = ($sidx == $#spinner) ? 0 : $sidx + 1; + } + + do_automated_cleanups(); + + $ptr = 0; + until ($ptr == $#record) { + # naked ampersands + if ($record[$ptr] =~ /&/ && $record[$ptr] !~ /&\w+?;/) + { edit("Naked ampersand"); $ptr= 0; next } + + if ($record[$ptr] =~ /{autoscrub}; + next; + } + # test for existing 901/903 unless we're autocleaning them + unless ($conf->{'strip-nines'}) { + if ($match == 901 or $match == 903) { + edit("Incoming 901/903 found in data"); + next; + } + } + } + + # subfields can't be non-alphanumeric + if ($record[$ptr] =~ /\n"; +print $OUT "\nDone. \n"; + + +#----------------------------------------------------------------------------------- +# cleanup routines +#----------------------------------------------------------------------------------- + +sub do_automated_cleanups { + $ptr = 0; + until ($ptr == $#record) { + # get datafield/tag data if we have it + stow_record_data(); + + # catch empty datafield elements + if ($record[$ptr] =~ m/|) { + my @a = @record[0 .. $ptr - 1]; + my @b = @record[$ptr + 2 .. $#record]; + @record = (@a, @b); + message("Empty datafield scrubbed"); + $ptr = 0; + next; + } + } + # and quasi-empty subfields + if ($record[$ptr] =~ m|(.*?)(.+?)|) { + my $leader = $1; + if (length $leader < 24) { + $leader .= ' ' x (20 - length($leader)); + $leader .= "4500"; + $record[$ptr] = "$leader\n"; + message("Short leader padded"); + } + } + if ($record[$ptr] =~ m|(.+?)$content\n"; + message("Short 008 padded"); + } + } + + # clean misplaced dollarsigns + if ($record[$ptr] =~ m|c?\d+\.\d{2}|) { + $record[$ptr] =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|; + message("Dollar sign corrected"); + } + + # clean up tags with spaces in them + $record[$ptr] =~ s/tag=" /tag="00/g; + $record[$ptr] =~ s/tag=" /tag="0/g; + $record[$ptr] =~ s/tag="-/tag="0/g; + $record[$ptr] =~ s/tag="(\d\d) /tag="0$1/g; + + # automatable subfield maladies + $record[$ptr] =~ s/code=" ">c/code="c">/; + $record[$ptr] =~ s/code=" ">\$/code="c"$>/; + } +} + +sub stow_record_data { + # get tag data if we're looking at it + + if ($record[$ptr] =~ m/{'original-subfield'} and + $recmeta{tag} == $conf->{'original-tag'}) { + my $line = $record[$ptr]; my $lptr = $ptr; + my $osub = $conf->{'original-subfield'}; + $recmeta{oid} = 'NONE'; + + until ($line =~ m||) { + $lptr++; + $line = $record[$lptr]; + $recmeta{oid} = $+{TAG} + if ($line =~ /(.+?){trash}{ $recmeta{tag} }; + $conf->{editmsg} = $msg; + print_fullcontext(); + + # stow original problem line + $recmeta{origline} = $record[$ptr]; + + while (1) { + my $line = $term->readline('marc-cleanup>'); + my @chunks = split /\s+/, $line; + + # lines with single-character first chunks are commands. + # make sure they exist. + if (length $chunks[0] == 1) { + unless (defined $commands{$chunks[0]}) { + print $OUT "No such command '", $chunks[0], "'\n"; + next; + } + } + + if (defined $commands{$chunks[0]}) { + my $term = $commands{$chunks[0]}->(@chunks[1..$#chunks]); + last if $term; + } else { + $recmeta{prevline} = $record[$ptr]; + $record[$ptr] = "$line\n"; + print_context(); + } + } + # set pointer to top on the way out + $ptr = 0; +} + +=head2 buildrecord + +Constructs record arrays from the incoming MARC file and returns them +to the driver loop. + +=cut + +sub buildrecord { + my $l = ''; + $l = while (defined $l and $l !~ //); + return $l unless defined $l; + @record = (); + %recmeta = (); + $conf->{ricount}++; + + until ($l =~ m||) + { push @record, $l; $l = ; } + push @record, $l; + return 1; +} + +sub write_record { + my ($FH) = @_; + my $trash = $conf->{trash}; + + if ($FH eq 'EX') { + $EXMARC = undef; + open $EXMARC, '>:utf8', $conf->{exception} + or die "Can't open exception file $!\n"; + $FH = $EXMARC; + } + + $conf->{rocount}++ if ($FH eq $NUMARC); + print $FH '\n" + if(defined $recmeta{explanation}); + + # excise unwanted tags + if (keys %{$trash} or $conf->{autoscrub}) { + my @trimmed = (); + my $istrash = 0; + for my $line (@record) { + if ($istrash) { + $istrash = 0 if $line =~ m|{$tag} or ($conf->{autoscrub} and $tag =~ /\D/)) { + $istrash = 1; + next + } + } + push @trimmed, $line; + } + @record = @trimmed; + } + + # add 903(?) with new record id + my $renumber = ''; + if ($conf->{'renumber-from'}) { + $recmeta{nid} = $conf->{'renumber-from'}; + $renumber = join('', ' ', $recmeta{nid}, "\n"); + my @tmp = @record[0 .. $#record - 1]; + my $last = $record[$#record]; + @record = (@tmp, $renumber, $last); + $conf->{'renumber-from'}++; + } + + # scrub newlines (unless told not to or writing exception record) + unless ($conf->{nocollapse} or $FH eq $EXMARC) + { s/\n// for (@record) } + + # write to old->new map file if needed + if ($conf->{'renumber-from'} and $conf->{'original-subfield'}) { + unless (defined $recmeta{oid}) { + my $msg = join(' ', "No old id num found"); + dump_record($msg); + } else { + print $OLD2NEW $recmeta{oid}, "\t", $recmeta{nid}, "\n" + } + } + + # actually write the record + print $FH @record,"\n"; + + # if we were dumping to exception file, nuke the record and set ptr + # to terminate processing loop + @record = ('a'); + $ptr = 0; +} + +sub print_fullcontext { + print $OUT "\r", ' ' x 72, "\n"; + print $OUT $conf->{editmsg},"\n"; + print $OUT "\r Tag:",$recmeta{tag}, " Ind1:'", + $recmeta{ind1},"' Ind2:'", $recmeta{ind2}, "'"; + print $OUT " @ ", $conf->{ricount}, "/", $conf->{rocount} + 1; + print_context(); + return 0; +} + +sub print_context { + my $upper = int($conf->{window} / 2) + 1; + my $lower = int($conf->{window} / 2) - 1; + my $start = ($ptr - $upper < 0) ? 0 : $ptr - $upper; + my $stop = ($ptr + $lower > $#record) ? $#record : $ptr + $lower; + print $OUT "\n"; + print $OUT ' |', $record[$_] for ($start .. $ptr - 1); + print $OUT '==> |', $record[$ptr]; + print $OUT ' |', $record[$_] for ($ptr + 1 .. $stop); + print $OUT "\n"; + return 0; +} + +sub message { + my ($msg) = @_; + print $OUT "\r$msg at ",$conf->{ricount},"/",$conf->{rocount} + 1,"\n"; +} + +#----------------------------------------------------------------------------------- +# command routines +#----------------------------------------------------------------------------------- + +sub substitute { + my (@chunks) = @_; + + my $ofrom = shift @chunks; + if ($ofrom =~ /^'/) { + until ($ofrom =~ /'$/ or !@chunks) + { $ofrom .= join(' ','',shift @chunks) } + $ofrom =~ s/^'//; $ofrom =~ s/'$//; + } + my $to = shift @chunks; + if ($to =~ /^'/) { + until ($to =~ /'$/ or !@chunks) + { $to .= join(' ','',shift @chunks) } + $to =~ s/^'//; $to =~ s/'$//; + } + + my $from = ''; + for my $char (split(//,$ofrom)) { + $char = "\\" . $char if ($char =~ /\W/); + $from = join('', $from, $char); + } + + $recmeta{prevline} = $record[$ptr]; + $record[$ptr] =~ s/$from/$to/; + print_context(); + return 0; +} + +sub merge_lines { + $recmeta{prevline} = $record[$ptr]; + # remove //; + # and move to front of line + $record[$ptr] = join(' ', $1 , $record[$ptr]); + # tear off trailing subfield tag from preceeding line + $record[$ptr - 1] =~ s|\n||; + # join current line onto preceeding line + $record[$ptr - 1] = join('', $record[$ptr - 1], $record[$ptr]); + # erase current line + my @a = @record[0 .. $ptr - 1]; + my @b = @record[$ptr + 1 .. $#record]; + @record = (@a, @b); + # move record pointer to previous line + prev_line(); + print_context(); + return 0; +} + +sub flip_line { + unless ($recmeta{prevline}) + { print $OUT "No previously edited line to flip\n"; return } + my $temp = $record[$ptr]; + $record[$ptr] = $recmeta{prevline}; + $recmeta{prevline} = $temp; + print_context(); + return 0; +} + +sub kill_line { + $recmeta{killline} = $record[$ptr]; + my @a = @record[0 .. $ptr - 1]; + my @b = @record[$ptr + 1 .. $#record]; + @record = (@a, @b); + print_context(); + return 0; +} + +sub yank_line { + unless ($recmeta{killline}) + { print $OUT "No killed line to yank\n"; return } + my @a = @record[0 .. $ptr - 1]; + my @b = @record[$ptr .. $#record]; + @record = (@a, $conf->{killline}, @b); + print_context(); + return 0; +} + +sub insert_original { + $record[$ptr] = $recmeta{origline}; + print_context(); + return 0; +} + +sub display_lines { + print $OUT "\nOrig. edit line :", $recmeta{origline}; + print $OUT "Current flip line:", $recmeta{prevline} if $recmeta{prevline}; + print $OUT "Last killed line :", $recmeta{killline} if $recmeta{killline}; + print $OUT "\n"; + return 0; +} + +sub dump_record { + my (@explanation) = @_; + print $OUT @explanation; + $recmeta{explanation} = join(' ', 'Tag', $recmeta{tag}, @explanation); + write_record($EXMARC); + return 1; +} + +sub next_line { + $ptr++ unless ($ptr == $#record);; + print_context(); + return 0; +} + +sub prev_line { + $ptr-- unless ($ptr == 0); + print_context(); + return 0; +} + +sub commit_edit { return 1 } + +sub widen_window { + if ($conf->{window} == 15) + { print $OUT "Window can't be bigger than 15 lines\n"; return } + $conf->{window} += 2; + print_context; +} + +sub narrow_window { + if ($conf->{window} == 5) + { print $OUT "Window can't be smaller than 5 lines\n"; return } + $conf->{window} -= 2; + print_context; +} + +sub help { +print $OUT < Contract context window | y Yank last killed line +p Move pointer to prev line | m Merge current line into preceding line +n Move pointer to next line | o Insert original line +c Print line context | f Flip current line and last edited line +d Print current saved lines | +-----------------------------+------------------------------------------- +s Subtitute; replace ARG1 in current line with ARG2. If either ARG + contains spaces, it must be single-quoted +t Commit changes and resume automated operations +x Dump record to exception file +q Quit + +HELP +return 0; +} + +sub quit { exit } + +#----------------------------------------------------------------------------------- +# populate_trash +#----------------------------------------------------------------------------------- +# defined a domain-specific language for specifying MARC tags to be dropped from +# records during processing. it is line oriented, and is specified as follows: +# +# each line may specify any number of tags to be included, either singly (\d{1,3}) +# or as a range (\d{1,3}\.\.\d{1,3} +# +# if a single number is given, it must be between '000' and '999', inclusive. +# +# ranges obey the previous rule, and also the first number of the range must be less +# than the second number +# +# finally, any single range in a line may be followed by the keyword 'except'. every +# number or range after 'except' is excluded from the range specified. all these +# numbers must actually be within the range. +# +# specifying a tag twice is an error, to help prevent typos + +sub populate_trash { + print $OUT ">>> TRASHTAGS FILE FOUND. LOADING TAGS TO BE STRIPPED FROM OUTPUT\n"; + open TRASH, '<', $conf->{trashfile} + or die "Can't open trash tags file!\n"; + while () { + my $lastwasrange = 0; + my %lastrange = ( high => 0, low => 0); + my $except = 0; + + my @chunks = split /\s+/; + while (my $chunk = shift @chunks) { + + # single values + if ($chunk =~ /^\d{1,3}$/) { + trash_add($chunk, $except); + $lastwasrange = 0; + next; + } + + # ranges + if ($chunk =~ /^\d{1,3}\.\.\d{1,3}$/) { + my ($low, $high) = trash_add_range($chunk, $except, \%lastrange); + $lastwasrange = 1; + %lastrange = (low => $low, high => $high) + unless $except; + next; + } + + # 'except' + if ($chunk eq 'except') { + die "Keyword 'except' can only follow a range (line $.)\n" + unless $lastwasrange; + die "Keyword 'except' may only occur once per line (line $.)\n" + if $except; + $except = 1; + next; + } + + die "Unknown chunk $chunk in .trashtags file (line $.)\n"; + } + } + + # remove original id sequence tag from trash hash if we know it + trash_add($conf->{'original-tag'}, 1) + if ($conf->{'original-tag'} and $conf->{trash}{ $conf->{'original-tag'} }); +} + +sub trash_add_range { + my ($chunk, $except, $range) = @_; + my ($low,$high) = split /\.\./, $chunk; + die "Ranges must be 'low..high' ($low is greater than $high on line $.)\n" + if ($low > $high); + if ($except) { + die "Exception ranges must be within last addition range (line $.)\n" + if ($low < $range->{low} or $high > $range->{high}); + } + for my $tag ($low..$high) { + trash_add($tag, $except) + } + return $low, $high; +} + +sub trash_add { + my ($tag, $except) = @_; + my $trash = $conf->{trash}; + + die "Trash values must be valid tags (000-999)\n" + unless ($tag >= 0 and $tag <= 999); + + if ($except) { + delete $trash->{$tag}; + } else { + die "Trash tag '$tag' specified twice (line $.)\n" + if $trash->{$tag}; + $trash->{$tag} = 1; + } +} + +#----------------------------------------------------------------------- + +=head2 initialize + +Performs boring script initialization. Handles argument parsing, +mostly. + +=cut + +sub initialize { + my ($c) = @_; + my @missing = (); + + # set mode on existing filehandles + binmode(STDIN, ':utf8'); + + my $rc = GetOptions( $c, + 'autoscrub|a', + 'exception|x=s', + 'output|o=s', + 'prefix|p=s', + 'nocollapse|n', + 'renumber-from|rf=i', + 'renumber-tag|rt=i', + 'renumber-subfield|rs=s', + 'original-tag|ot=i', + 'original-subfield|os=s', + 'script', + 'strip-nines', + 'trashfile|t=s', + 'trashhelp', + 'help|h', + ); + show_help() unless $rc; + show_help() if ($c->{help}); + show_trashhelp() if ($c->{trashhelp}); + + # defaults + if ($c->{prefix}) { + $c->{output} = join('.',$c->{prefix},'marc','xml'); + $c->{exception} = join('.',$c->{prefix},'marc','ex'); + } + $c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'}; + $c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'}; + $c->{window} = 5; + + # autotrash 901, 903 if strip-nines + if ($c->{'strip-nines'}) { + $c->{trash}{901} = 1; + $c->{trash}{903} = 1; + } + + my @keys = keys %{$c}; + show_help() unless (@ARGV and @keys); +} + +sub show_help { + print < +Options + --output -o Cleaned MARCXML output filename + --exception -x Exception (dumped records) MARCXML filename + or + --prefix=> -p Shared prefix for output/exception files. Will + produce PREFIX.marc.xml and PREFIX.ex.xml + + --trashfile -t File containing trash tag data (see --trashhelp) + + --renumber-from -rf Begin renumbering id sequence with this number + --renumber-tag -rt Tag to use in renumbering (default: 903) + --renumber-subfield -rs Subfield code to use in renumbering (default: a) + --original-tag -ot Original id tag; will be kept in output even if + it appears in the trash file + --original-subfield -os Original id subfield code. If this is specified + and renumbering is in effect, an old-to-new mapping + file (old2new.map) will be generated. + + --nocollapse -n Don't compress records to one line on output + --autoscrub -a Automatically remove non-numeric tags in data + --strip-nines Automatically remove any existing 901/903 tags in data + + --script Store human-initiated ops in scriptfile (.mcscript) + Not yet implemented +HELP +exit; +} + +sub show_trashhelp { + print <