X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=marc_cleanup;h=e2a9390ecf6a33ce6cf52dc91b5664faf2989c36;hp=8438a06242d3d9f13fb0cd773f31dcbbbff2ee9f;hb=9e967d7a4207176f0778f0e296ab39d8223eb431;hpb=5fae15058cb2f180418178291abc6760feead258 diff --git a/marc_cleanup b/marc_cleanup index 8438a06..e2a9390 100755 --- a/marc_cleanup +++ b/marc_cleanup @@ -1,34 +1,58 @@ #!/usr/bin/perl +# Copyright 2009-2012, Equinox Software, Inc. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +require 5.10.0; + use strict; use warnings; use Getopt::Long; use Term::ReadLine; +use Equinox::Migration::SimpleTagList; my $term = new Term::ReadLine 'yaz-cleanup'; my $OUT = $term->OUT || \*STDOUT; +binmode STDOUT, ":utf8"; +binmode $OUT, ":utf8"; $| = 1; # initialization and setup -my $conf = {}; -initialize($conf); -populate_trash() if ($conf->{trashfile}); +my $c = {}; +initialize($c); # set up files, since everything appears to be in order -open MARC, '<:utf8', (shift || 'incoming.marc.xml') +open MARC, '<:utf8', $c->{marcfile} or die "Can't open input file $!\n"; -open my $NUMARC, '>:utf8', $conf->{output} +open my $NUMARC, '>:utf8', $c->{output} or die "Can't open output file $!\n"; open my $OLD2NEW, '>', 'old2new.map' - if ($conf->{'renumber-from'} and $conf->{'original-subfield'}); + if ($c->{'renumber-from'} and $c->{'original-tag'}); my $EXMARC = 'EX'; +print $NUMARC "\n"; +$c->{totalrecs} = `grep -c '{marcfile}`; +chomp $c->{totalrecs}; +$c->{percent} = 0; -my @record = (); # current record storage -my %recmeta = (); # metadata about current record -my $ptr = 0; # record index pointer +my @record; # current record storage +my %recmeta; # metadata about current record +my $ptr = 0; # record index pointer # this is the dispatch table which drives command selection in # edit(), below @@ -52,19 +76,25 @@ my %commands = ( c => \&print_fullcontext, help => \&help, ); -my @spinner = qw(- / | \\); +my @spinner = qw(- \\ | /); my $sidx = 0; while ( buildrecord() ) { - unless ($conf->{ricount} % 100) { - print "\rWorking... ", $spinner[$sidx]; + unless ($c->{ricount} % 50) { + $c->{percent} = int(($c->{ricount} / $c->{totalrecs}) * 100); + print "\rWorking (",$c->{percent},"%) ", $spinner[$sidx]; $sidx = ($sidx == $#spinner) ? 0 : $sidx + 1; } - do_automated_cleanups(); + my $rc = do_automated_cleanups(); + next if $rc; $ptr = 0; until ($ptr == $#record) { + # get datafield/tag data if we have it + $rc = stow_record_data() if ($c->{'renumber-from'} and $c->{'original-tag'}); + next if $rc; + # naked ampersands if ($record[$ptr] =~ /&/ && $record[$ptr] !~ /&\w+?;/) { edit("Naked ampersand"); $ptr= 0; next } @@ -73,33 +103,30 @@ while ( buildrecord() ) { my $match = $1; # tags must be numeric if ($match =~ /\D/) { - edit("Non-numerics in tag") unless $conf->{autoscrub}; + edit("Non-numerics in tag") unless $c->{autoscrub}; next; } - # test for existing 901/903 unless we're autocleaning them - unless ($conf->{'strip-nines'}) { - if ($match == 901 or $match == 903) { - edit("Incoming 901/903 found in data"); - next; - } - } } # subfields can't be non-alphanumeric if ($record[$ptr] =~ /\n"; -print $OUT "\nDone. \n"; +print $NUMARC "\n"; +print $OUT "\nDone. ",$c->{ricount}," in; ",$c->{rocount}," dumped \n"; #----------------------------------------------------------------------------------- @@ -109,8 +136,6 @@ print $OUT "\nDone. \n"; sub do_automated_cleanups { $ptr = 0; until ($ptr == $#record) { - # get datafield/tag data if we have it - stow_record_data(); # catch empty datafield elements if ($record[$ptr] =~ m/|) { + $record[$ptr] =~ s|\s{10,}||; + message("Trailing whitespace trimmed from subfield contents"); + } # automatable subfield maladies $record[$ptr] =~ s/code=" ">c/code="c">/; - $record[$ptr] =~ s/code=" ">\$/code="c"$>/; + $record[$ptr] =~ s/code=" ">\$/code="c">\$/; + + if ($c->{'fix-subfield'}) { + $record[$ptr] =~ s/code="&">/code="$c->{'fix-subfield'}">/; + $record[$ptr] =~ s/code="\P{IsAlnum}">/code="$c->{'fix-subfield'}">/; + $record[$ptr] =~ s/code="">/code="$c->{'fix-subfield'}">/; + } } + return 0; } sub stow_record_data { # get tag data if we're looking at it - - if ($record[$ptr] =~ m/{'original-subfield'} and - $recmeta{tag} == $conf->{'original-tag'}) { - my $line = $record[$ptr]; my $lptr = $ptr; - my $osub = $conf->{'original-subfield'}; - $recmeta{oid} = 'NONE'; - - until ($line =~ m||) { - $lptr++; - $line = $record[$lptr]; - $recmeta{oid} = $+{TAG} - if ($line =~ /(.+?){'original-tag'}) { + my $oid = 0; + if ($tag < 10) { + # controlfield + if ($record[$ptr] =~ m|(.+?)|) + { $oid = $1; print $OLD2NEW "$oid\t", $recmeta{nid}, "\n" } + } elsif ($tag >= 10 and $c->{'original-subfield'}) { + # datafield + my $line = $record[$ptr]; my $lptr = $ptr; + my $osub = $c->{'original-subfield'}; + # skim to end of this tag + until ($line =~ m||) { + if ($line =~ /(.+?)new map file + if ($c->{'renumber-from'} and $c->{'original-subfield'}) { } + } } + return 0; } #----------------------------------------------------------------------------------- @@ -223,8 +280,11 @@ Handles the Term::ReadLine loop sub edit { my ($msg) = @_; - return if $conf->{trash}{ $recmeta{tag} }; - $conf->{editmsg} = $msg; + return if $c->{trash}->has( $recmeta{tag} ); + if ( $c->{fullauto} ) + { dump_record($msg); return } + + $c->{editmsg} = $msg; print_fullcontext(); # stow original problem line @@ -265,85 +325,80 @@ to the driver loop. sub buildrecord { my $l = ''; - $l = while (defined $l and $l !~ //); + my $istrash = 0; + my $trash = $c->{trash}; + + $l = while (defined $l and $l !~ /{ricount}++; + $c->{ricount}++; + + for (keys %recmeta) { $recmeta{$_} = undef } + for (0 .. @record) { delete $record[$_] } + + my $i = 0; + until ($l =~ m||) { + # clean up tags with spaces in them + $l =~ s/tag=" /tag="00/g; + $l =~ s/tag=" /tag="0/g; + $l =~ s/tag="-/tag="0/g; + $l =~ s/tag="(\d\d) /tag="0$1/g; + + # excise unwanted tags + if ($istrash) { + $istrash = 0 if ($l =~ m|; + next; + } + if ($l =~ m/has($1) or ($c->{autoscrub} and $1 =~ /\D/)) + { $istrash = 1; next } + } + + push @record, $l; + $l = ; + $i++; + } + + # add 903(?) with new record id + if ($c->{'renumber-from'}) { + $recmeta{nid} = $c->{'renumber-from'}; + push @record, join('', ' ', + $recmeta{nid}, + "\n"); + $c->{'renumber-from'}++; + } + $i++; - until ($l =~ m||) - { push @record, $l; $l = ; } push @record, $l; return 1; } sub write_record { my ($FH) = @_; - my $trash = $conf->{trash}; if ($FH eq 'EX') { $EXMARC = undef; - open $EXMARC, '>:utf8', $conf->{exception} + open $EXMARC, '>:utf8', $c->{exception} or die "Can't open exception file $!\n"; $FH = $EXMARC; } - $conf->{rocount}++ if ($FH eq $NUMARC); print $FH '\n" if(defined $recmeta{explanation}); - # excise unwanted tags - if (keys %{$trash} or $conf->{autoscrub}) { - my @trimmed = (); - my $istrash = 0; - for my $line (@record) { - if ($istrash) { - $istrash = 0 if $line =~ m|{$tag} or ($conf->{autoscrub} and $tag =~ /\D/)) { - $istrash = 1; - next - } - } - push @trimmed, $line; - } - @record = @trimmed; - } - - # add 903(?) with new record id - my $renumber = ''; - if ($conf->{'renumber-from'}) { - $recmeta{nid} = $conf->{'renumber-from'}; - $renumber = join('', ' ', $recmeta{nid}, "\n"); - my @tmp = @record[0 .. $#record - 1]; - my $last = $record[$#record]; - @record = (@tmp, $renumber, $last); - $conf->{'renumber-from'}++; - } - # scrub newlines (unless told not to or writing exception record) - unless ($conf->{nocollapse} or $FH eq $EXMARC) + unless ($c->{nocollapse} or $FH eq $EXMARC) { s/\n// for (@record) } - # write to old->new map file if needed - if ($conf->{'renumber-from'} and $conf->{'original-subfield'}) { - unless (defined $recmeta{oid}) { - my $msg = join(' ', "No old id num found"); - dump_record($msg); - } else { - print $OLD2NEW $recmeta{oid}, "\t", $recmeta{nid}, "\n" - } - } - # actually write the record print $FH @record,"\n"; + # increment output record count (if not exception) + $c->{rocount}++ if ($FH eq $EXMARC); + # if we were dumping to exception file, nuke the record and set ptr # to terminate processing loop @record = ('a'); @@ -352,17 +407,17 @@ sub write_record { sub print_fullcontext { print $OUT "\r", ' ' x 72, "\n"; - print $OUT $conf->{editmsg},"\n"; + print $OUT $c->{editmsg},"\n"; print $OUT "\r Tag:",$recmeta{tag}, " Ind1:'", $recmeta{ind1},"' Ind2:'", $recmeta{ind2}, "'"; - print $OUT " @ ", $conf->{ricount}, "/", $conf->{rocount} + 1; + print $OUT " @ ", $c->{ricount}, "/", $c->{totalrecs}; print_context(); return 0; } sub print_context { - my $upper = int($conf->{window} / 2) + 1; - my $lower = int($conf->{window} / 2) - 1; + my $upper = int($c->{window} / 2) + 1; + my $lower = int($c->{window} / 2) - 1; my $start = ($ptr - $upper < 0) ? 0 : $ptr - $upper; my $stop = ($ptr + $lower > $#record) ? $#record : $ptr + $lower; print $OUT "\n"; @@ -375,7 +430,7 @@ sub print_context { sub message { my ($msg) = @_; - print $OUT "\r$msg at ",$conf->{ricount},"/",$conf->{rocount} + 1,"\n"; + print $OUT "\r$msg at ",$c->{ricount},"/",$c->{totalrecs}, "\n"; } #----------------------------------------------------------------------------------- @@ -406,6 +461,7 @@ sub substitute { $recmeta{prevline} = $record[$ptr]; $record[$ptr] =~ s/$from/$to/; + $ofrom = undef; $to = undef; $from = undef; print_context(); return 0; } @@ -436,6 +492,7 @@ sub flip_line { my $temp = $record[$ptr]; $record[$ptr] = $recmeta{prevline}; $recmeta{prevline} = $temp; + $temp = undef; print_context(); return 0; } @@ -445,6 +502,7 @@ sub kill_line { my @a = @record[0 .. $ptr - 1]; my @b = @record[$ptr + 1 .. $#record]; @record = (@a, @b); + @a = undef; @b = undef; print_context(); return 0; } @@ -454,7 +512,8 @@ sub yank_line { { print $OUT "No killed line to yank\n"; return } my @a = @record[0 .. $ptr - 1]; my @b = @record[$ptr .. $#record]; - @record = (@a, $conf->{killline}, @b); + @record = (@a, $c->{killline}, @b); + @a = undef; @b = undef; print_context(); return 0; } @@ -475,8 +534,8 @@ sub display_lines { sub dump_record { my (@explanation) = @_; - print $OUT @explanation; - $recmeta{explanation} = join(' ', 'Tag', $recmeta{tag}, @explanation); + $recmeta{explanation} = join(' ', 'DUMPING RECORD: Tag', $recmeta{tag}, @explanation); + message( $recmeta{explanation} ); write_record($EXMARC); return 1; } @@ -496,16 +555,16 @@ sub prev_line { sub commit_edit { return 1 } sub widen_window { - if ($conf->{window} == 15) + if ($c->{window} == 15) { print $OUT "Window can't be bigger than 15 lines\n"; return } - $conf->{window} += 2; + $c->{window} += 2; print_context; } sub narrow_window { - if ($conf->{window} == 5) + if ($c->{window} == 5) { print $OUT "Window can't be smaller than 5 lines\n"; return } - $conf->{window} -= 2; + $c->{window} -= 2; print_context; } @@ -533,104 +592,6 @@ return 0; sub quit { exit } -#----------------------------------------------------------------------------------- -# populate_trash -#----------------------------------------------------------------------------------- -# defined a domain-specific language for specifying MARC tags to be dropped from -# records during processing. it is line oriented, and is specified as follows: -# -# each line may specify any number of tags to be included, either singly (\d{1,3}) -# or as a range (\d{1,3}\.\.\d{1,3} -# -# if a single number is given, it must be between '000' and '999', inclusive. -# -# ranges obey the previous rule, and also the first number of the range must be less -# than the second number -# -# finally, any single range in a line may be followed by the keyword 'except'. every -# number or range after 'except' is excluded from the range specified. all these -# numbers must actually be within the range. -# -# specifying a tag twice is an error, to help prevent typos - -sub populate_trash { - print $OUT ">>> TRASHTAGS FILE FOUND. LOADING TAGS TO BE STRIPPED FROM OUTPUT\n"; - open TRASH, '<', $conf->{trashfile} - or die "Can't open trash tags file!\n"; - while () { - my $lastwasrange = 0; - my %lastrange = ( high => 0, low => 0); - my $except = 0; - - my @chunks = split /\s+/; - while (my $chunk = shift @chunks) { - - # single values - if ($chunk =~ /^\d{1,3}$/) { - trash_add($chunk, $except); - $lastwasrange = 0; - next; - } - - # ranges - if ($chunk =~ /^\d{1,3}\.\.\d{1,3}$/) { - my ($low, $high) = trash_add_range($chunk, $except, \%lastrange); - $lastwasrange = 1; - %lastrange = (low => $low, high => $high) - unless $except; - next; - } - - # 'except' - if ($chunk eq 'except') { - die "Keyword 'except' can only follow a range (line $.)\n" - unless $lastwasrange; - die "Keyword 'except' may only occur once per line (line $.)\n" - if $except; - $except = 1; - next; - } - - die "Unknown chunk $chunk in .trashtags file (line $.)\n"; - } - } - - # remove original id sequence tag from trash hash if we know it - trash_add($conf->{'original-tag'}, 1) - if ($conf->{'original-tag'} and $conf->{trash}{ $conf->{'original-tag'} }); -} - -sub trash_add_range { - my ($chunk, $except, $range) = @_; - my ($low,$high) = split /\.\./, $chunk; - die "Ranges must be 'low..high' ($low is greater than $high on line $.)\n" - if ($low > $high); - if ($except) { - die "Exception ranges must be within last addition range (line $.)\n" - if ($low < $range->{low} or $high > $range->{high}); - } - for my $tag ($low..$high) { - trash_add($tag, $except) - } - return $low, $high; -} - -sub trash_add { - my ($tag, $except) = @_; - my $trash = $conf->{trash}; - - die "Trash values must be valid tags (000-999)\n" - unless ($tag >= 0 and $tag <= 999); - - if ($except) { - delete $trash->{$tag}; - } else { - die "Trash tag '$tag' specified twice (line $.)\n" - if $trash->{$tag}; - $trash->{$tag} = 1; - } -} - #----------------------------------------------------------------------- =head2 initialize @@ -649,8 +610,10 @@ sub initialize { my $rc = GetOptions( $c, 'autoscrub|a', + 'fullauto', 'exception|x=s', 'output|o=s', + 'marcfile|m=s', 'prefix|p=s', 'nocollapse|n', 'renumber-from|rf=i', @@ -658,8 +621,9 @@ sub initialize { 'renumber-subfield|rs=s', 'original-tag|ot=i', 'original-subfield|os=s', + 'fix-subfield|fs=s', 'script', - 'strip-nines', + 'no-strip9', 'trashfile|t=s', 'trashhelp', 'help|h', @@ -669,35 +633,48 @@ sub initialize { show_trashhelp() if ($c->{trashhelp}); # defaults - if ($c->{prefix}) { - $c->{output} = join('.',$c->{prefix},'clean','marc','xml'); - $c->{exception} = join('.',$c->{prefix},'marc','ex'); - } + my $pfx = defined($c->{prefix}) ? $c->{prefix} : "bibs"; + $c->{ricount} = 0; + $c->{rocount} = 0; $c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'}; $c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'}; - $c->{window} = 5; - - # autotrash 901, 903 if strip-nines - if ($c->{'strip-nines'}) { - $c->{trash}{901} = 1; - $c->{trash}{903} = 1; + $c->{window} = 9; + if ($c->{prefix}) { + $c->{output} = join('.',$c->{prefix},'clean','marc','xml') + unless $c->{output}; + $c->{exception} = join('.',$c->{prefix},'exception','marc','xml') + unless $c->{exception}; + $c->{marcfile} = $c->{prefix} . '.marc.xml' + unless $c->{marcfile}; } + show_help() unless ($c->{marcfile} and $c->{output}); - my @keys = keys %{$c}; - show_help() unless (@ARGV and @keys); + if ($c->{trashfile}) { + $c->{trash} = Equinox::Migration::SimpleTagList->new(file => $c->{trashfile}) + } else { + $c->{trash} = Equinox::Migration::SimpleTagList->new; + } + # autotrash 901, 903 unless no strip-nines + unless ($c->{'no-strip9'}) { + $c->{trash}->add_tag(901); + $c->{trash}->add_tag(903); + } + # remove original id sequence tag from trash hash if we know it + $c->{trash}->remove_tag($c->{'original-tag'}) + if ( $c->{'original-tag'} and $c->{trash}->has($c->{'original-tag'}) ); } sub show_help { print < +Usage is: marc_cleanup [OPTIONS] Options --output -o Cleaned MARCXML output filename --exception -x Exception (dumped records) MARCXML filename or - --prefix=> -p Shared prefix for output/exception files. Will - produce PREFIX.marc.xml and PREFIX.ex.xml + --prefix= -p Shared prefix for output/exception files. Will produce + PREFIX.clean.marc.xml and PREFIX.exception.marc.xml - --trashfile -t File containing trash tag data (see --trashhelp) + --marcfile -m Input filename. Defaults to PREFIX.marc.xml --renumber-from -rf Begin renumbering id sequence with this number --renumber-tag -rt Tag to use in renumbering (default: 903) @@ -708,44 +685,27 @@ Options and renumbering is in effect, an old-to-new mapping file (old2new.map) will be generated. - --nocollapse -n Don't compress records to one line on output - --autoscrub -a Automatically remove non-numeric tags in data - --strip-nines Automatically remove any existing 901/903 tags in data + --autoscrub -a Automatically remove non-numeric tags in data + --fix-subfield -fs Subfield code to use in place of non-alphanumeric + or empty subfield codes + --nocollapse -n Don't compress records to one line on output + --no-strip9 Don't autoremove 901/903 tags in data + --trashfile -t File containing trash tag data (see --trashhelp) + + --fullauto No manual edits. All problematic records dumped to + exception file. - --script Store human-initiated ops in scriptfile (.mcscript) - Not yet implemented HELP exit; } sub show_trashhelp { print <