X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=marc_cleanup;h=58a68c163e0786ee2ef20ce154ce2de3f3b1e398;hp=fe25aac3db09bc3d318fd45e2eeadf480bc298b6;hb=9a3207573976451e12bce078decca416dc4b5c3c;hpb=500f1b27392ccafe7288aa9f5721f233fdb0e00c diff --git a/marc_cleanup b/marc_cleanup index fe25aac..58a68c1 100755 --- a/marc_cleanup +++ b/marc_cleanup @@ -1,4 +1,21 @@ #!/usr/bin/perl + +# Copyright 2009-2012, Equinox Software, Inc. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + require 5.10.0; use strict; @@ -16,23 +33,22 @@ binmode $OUT, ":utf8"; $| = 1; # initialization and setup -my $conf = {}; -initialize($conf); +my $c = {}; +initialize($c); # set up files, since everything appears to be in order -my $marcfile = shift || 'incoming.marc.xml'; -open MARC, '<:utf8', $marcfile +open MARC, '<:utf8', $c->{marcfile} or die "Can't open input file $!\n"; -open my $NUMARC, '>:utf8', $conf->{output} +open my $NUMARC, '>:utf8', $c->{output} or die "Can't open output file $!\n"; open my $OLD2NEW, '>', 'old2new.map' - if ($conf->{'renumber-from'} and $conf->{'original-subfield'}); + if ($c->{'renumber-from'} and $c->{'original-tag'}); my $EXMARC = 'EX'; print $NUMARC "\n"; -$conf->{totalrecs} = `grep -c '{totalrecs}; -$conf->{percent} = 0; +$c->{totalrecs} = `grep -c '{marcfile}`; +chomp $c->{totalrecs}; +$c->{percent} = 0; my @record; # current record storage my %recmeta; # metadata about current record @@ -64,9 +80,9 @@ my @spinner = qw(- \\ | /); my $sidx = 0; while ( buildrecord() ) { - unless ($conf->{ricount} % 50) { - $conf->{percent} = int(($conf->{ricount} / $conf->{totalrecs}) * 100); - print "\rWorking (",$conf->{percent},"%) ", $spinner[$sidx]; + unless ($c->{ricount} % 50) { + $c->{percent} = int(($c->{ricount} / $c->{totalrecs}) * 100); + print "\rWorking (",$c->{percent},"%) ", $spinner[$sidx]; $sidx = ($sidx == $#spinner) ? 0 : $sidx + 1; } @@ -76,8 +92,8 @@ while ( buildrecord() ) { $ptr = 0; until ($ptr == $#record) { # get datafield/tag data if we have it - my $rc = stow_record_data(); - return $rc if $rc; + $rc = stow_record_data() if ($c->{'renumber-from'} and $c->{'original-tag'}); + next if $rc; # naked ampersands if ($record[$ptr] =~ /&/ && $record[$ptr] !~ /&\w+?;/) @@ -87,7 +103,7 @@ while ( buildrecord() ) { my $match = $1; # tags must be numeric if ($match =~ /\D/) { - edit("Non-numerics in tag") unless $conf->{autoscrub}; + edit("Non-numerics in tag") unless $c->{autoscrub}; next; } } @@ -95,11 +111,11 @@ while ( buildrecord() ) { # subfields can't be non-alphanumeric if ($record[$ptr] =~ /\n"; -print $OUT "\nDone. ",$conf->{ricount}," in / ",$conf->{rocount}," out \n"; +print $OUT "\nDone. ",$c->{ricount}," in; ",$c->{rocount}," dumped \n"; #----------------------------------------------------------------------------------- @@ -161,6 +177,11 @@ sub do_automated_cleanups { message("Short leader padded"); } } + if ($c->{'force-utf8'}) { + if ($record[$ptr] =~ m|(.........).(.+)|) { + $record[$ptr] = "$1a$2\n"; + } + } if ($record[$ptr] =~ m|(.+?)|) { + $record[$ptr] =~ s|\s{10,}||; + message("Trailing whitespace trimmed from subfield contents"); + } + # automatable subfield maladies - $record[$ptr] =~ s/code=" ">c/code="c">/; - $record[$ptr] =~ s/code=" ">\$/code="c">\$/; + if ($record[$ptr] =~ /code=" ">c/) { + message('Fixing probable subfield c, scenario 1'); + $record[$ptr] =~ s/code=" ">c/code="c">/; + } + if ($record[$ptr] =~ /code=" ">\$/) { + message('Fixing probable subfield c, scenario 2'); + $record[$ptr] =~ s/code=" ">\$/code="c">\$/; + } + + if ($c->{'fix-subfield'}) { + if ($record[$ptr] =~ /code="&">/) { + message('Fixing & for subfield code'); + $record[$ptr] =~ s/code="&">/code="$c->{'fix-subfield'}">/; + } + if ($record[$ptr] =~ /code="(.*?\P{IsAlnum}.*?)">/) { + message("Fixing non-alphanumeric subfield code: $1 -> " . $c->{'fix-subfield'}); + $record[$ptr] =~ s/code=".*?\P{IsAlnum}.*?">/code="$c->{'fix-subfield'}">/; + } + if ($record[$ptr] =~ /code="">/) { + message('Fixing null subfield code'); + $record[$ptr] =~ s/code="">/code="$c->{'fix-subfield'}">/; + } + } } return 0; } sub stow_record_data { # get tag data if we're looking at it - if ($record[$ptr] =~ m/{'original-subfield'} and $recmeta{tag} == $conf->{'original-tag'}) { - my $line = $record[$ptr]; my $lptr = $ptr; - my $osub = $conf->{'original-subfield'}; - $recmeta{oid} = 'NONE'; - - # skim to end of this tag - until ($line =~ m||) { - if ($line =~ /(.+?){'original-tag'}) { + my $oid = 0; + if ($tag < 10) { + # controlfield + if ($record[$ptr] =~ m|(.+?)|) + { $oid = $1; print $OLD2NEW "$oid\t", $recmeta{nid}, "\n" } + } elsif ($tag >= 10 and $c->{'original-subfield'}) { + # datafield + my $line = $record[$ptr]; my $lptr = $ptr; + my $osub = $c->{'original-subfield'}; + # skim to end of this tag + until ($line =~ m||) { + if ($line =~ /(.+?)new map file + if ($c->{'renumber-from'} and $c->{'original-subfield'}) { + } + } } return 0; @@ -235,8 +300,11 @@ Handles the Term::ReadLine loop sub edit { my ($msg) = @_; - return if $conf->{trash}->has( $recmeta{tag} ); - $conf->{editmsg} = $msg; + return if $c->{trash}->has( $recmeta{tag} ); + if ( $c->{fullauto} ) + { dump_record($msg); return } + + $c->{editmsg} = $msg; print_fullcontext(); # stow original problem line @@ -278,11 +346,11 @@ to the driver loop. sub buildrecord { my $l = ''; my $istrash = 0; - my $trash = $conf->{trash}; + my $trash = $c->{trash}; - $l = while (defined $l and $l !~ //); + $l = while (defined $l and $l !~ /{ricount}++; + $c->{ricount}++; for (keys %recmeta) { $recmeta{$_} = undef } for (0 .. @record) { delete $record[$_] } @@ -302,15 +370,29 @@ sub buildrecord { next; } if ($l =~ m/has($1) or ($conf->{autoscrub} and $1 =~ /\D/)) + if ($trash->has($1) or ($c->{autoscrub} and $1 =~ /\D/)) { $istrash = 1; next } } - $record[$i] = $l; + push @record, $l; $l = ; $i++; } - $record[$i] = $l; + + # add 903(?) with new record id + if ($c->{'renumber-from'}) { + $recmeta{nid} = $c->{'renumber-from'}; + push @record, join('', ' ', + $recmeta{nid}, + "\n"); + $c->{'renumber-from'}++; + } + $i++; + + push @record, $l; return 1; } @@ -319,43 +401,24 @@ sub write_record { if ($FH eq 'EX') { $EXMARC = undef; - open $EXMARC, '>:utf8', $conf->{exception} + open $EXMARC, '>:utf8', $c->{exception} or die "Can't open exception file $!\n"; $FH = $EXMARC; } - $conf->{rocount}++ if ($FH eq $NUMARC); print $FH '\n" if(defined $recmeta{explanation}); - # add 903(?) with new record id - my $renumber = ''; - if ($conf->{'renumber-from'}) { - $recmeta{nid} = $conf->{'renumber-from'}; - $renumber = join('', ' ', $recmeta{nid}, "\n"); - my @tmp = @record[0 .. @record - 2]; - my $last = $record[-1]; - @record = undef; - @record = (@tmp, $renumber, $last); - @tmp = undef; $last = undef; - $conf->{'renumber-from'}++; - } - # scrub newlines (unless told not to or writing exception record) - unless ($conf->{nocollapse} or $FH eq $EXMARC) + unless ($c->{nocollapse} or $FH eq $EXMARC) { s/\n// for (@record) } - # write to old->new map file if needed - if ($conf->{'renumber-from'} and $conf->{'original-subfield'}) { - print $OLD2NEW $recmeta{oid}, "\t", $recmeta{nid}, "\n" - } - # actually write the record print $FH @record,"\n"; + # increment output record count (if not exception) + $c->{rocount}++ if ($FH eq $EXMARC); + # if we were dumping to exception file, nuke the record and set ptr # to terminate processing loop @record = ('a'); @@ -364,17 +427,17 @@ sub write_record { sub print_fullcontext { print $OUT "\r", ' ' x 72, "\n"; - print $OUT $conf->{editmsg},"\n"; + print $OUT $c->{editmsg},"\n"; print $OUT "\r Tag:",$recmeta{tag}, " Ind1:'", $recmeta{ind1},"' Ind2:'", $recmeta{ind2}, "'"; - print $OUT " @ ", $conf->{ricount}, "/", $conf->{rocount} + 1; + print $OUT " @ ", $c->{ricount}, "/", $c->{totalrecs}; print_context(); return 0; } sub print_context { - my $upper = int($conf->{window} / 2) + 1; - my $lower = int($conf->{window} / 2) - 1; + my $upper = int($c->{window} / 2) + 1; + my $lower = int($c->{window} / 2) - 1; my $start = ($ptr - $upper < 0) ? 0 : $ptr - $upper; my $stop = ($ptr + $lower > $#record) ? $#record : $ptr + $lower; print $OUT "\n"; @@ -387,7 +450,7 @@ sub print_context { sub message { my ($msg) = @_; - print $OUT "\r$msg at ",$conf->{ricount},"/",$conf->{rocount} + 1,"\n"; + print $OUT "\r$msg at ",$c->{ricount},"/",$c->{totalrecs}, "\n"; } #----------------------------------------------------------------------------------- @@ -469,7 +532,7 @@ sub yank_line { { print $OUT "No killed line to yank\n"; return } my @a = @record[0 .. $ptr - 1]; my @b = @record[$ptr .. $#record]; - @record = (@a, $conf->{killline}, @b); + @record = (@a, $c->{killline}, @b); @a = undef; @b = undef; print_context(); return 0; @@ -491,9 +554,8 @@ sub display_lines { sub dump_record { my (@explanation) = @_; - print $OUT @explanation; - $recmeta{explanation} = join(' ', 'Tag', $recmeta{tag}, @explanation); - @explanation = undef; + $recmeta{explanation} = join(' ', 'DUMPING RECORD: Tag', $recmeta{tag}, @explanation); + message( $recmeta{explanation} ); write_record($EXMARC); return 1; } @@ -513,16 +575,16 @@ sub prev_line { sub commit_edit { return 1 } sub widen_window { - if ($conf->{window} == 15) + if ($c->{window} == 15) { print $OUT "Window can't be bigger than 15 lines\n"; return } - $conf->{window} += 2; + $c->{window} += 2; print_context; } sub narrow_window { - if ($conf->{window} == 5) + if ($c->{window} == 5) { print $OUT "Window can't be smaller than 5 lines\n"; return } - $conf->{window} -= 2; + $c->{window} -= 2; print_context; } @@ -568,8 +630,10 @@ sub initialize { my $rc = GetOptions( $c, 'autoscrub|a', + 'fullauto', 'exception|x=s', 'output|o=s', + 'marcfile|m=s', 'prefix|p=s', 'nocollapse|n', 'renumber-from|rf=i', @@ -577,26 +641,37 @@ sub initialize { 'renumber-subfield|rs=s', 'original-tag|ot=i', 'original-subfield|os=s', + 'fix-subfield|fs=s', + 'force-utf8', 'script', 'no-strip9', 'trashfile|t=s', 'trashhelp', 'help|h', ); - show_help() unless $rc and @ARGV; + show_help() unless $rc; show_help() if ($c->{help}); show_trashhelp() if ($c->{trashhelp}); # defaults - my $pfx = $c->{prefix} // "bibs"; - $c->{output} = join('.',$c->{prefix},'clean','marc','xml'); - $c->{exception} = join('.',$c->{prefix},'exception','marc','xml'); + my $pfx = defined($c->{prefix}) ? $c->{prefix} : "bibs"; + $c->{ricount} = 0; + $c->{rocount} = 0; $c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'}; $c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'}; - $c->{window} = 5; + $c->{window} = 9; + if ($c->{prefix}) { + $c->{output} = join('.',$c->{prefix},'clean','marc','xml') + unless $c->{output}; + $c->{exception} = join('.',$c->{prefix},'exception','marc','xml') + unless $c->{exception}; + $c->{marcfile} = $c->{prefix} . '.marc.xml' + unless $c->{marcfile}; + } + show_help() unless ($c->{marcfile} and $c->{output}); if ($c->{trashfile}) { - $c->{trash} = Equinox::Migration::SimpleTagList->new(file => $conf->{trashfile}) + $c->{trash} = Equinox::Migration::SimpleTagList->new(file => $c->{trashfile}) } else { $c->{trash} = Equinox::Migration::SimpleTagList->new; } @@ -608,21 +683,20 @@ sub initialize { # remove original id sequence tag from trash hash if we know it $c->{trash}->remove_tag($c->{'original-tag'}) if ( $c->{'original-tag'} and $c->{trash}->has($c->{'original-tag'}) ); - - my @keys = keys %{$c}; - show_help() unless (@ARGV and @keys); } sub show_help { print < +Usage is: marc_cleanup [OPTIONS] Options --output -o Cleaned MARCXML output filename --exception -x Exception (dumped records) MARCXML filename or - --prefix=> -p Shared prefix for output/exception files. Will produce + --prefix= -p Shared prefix for output/exception files. Will produce PREFIX.clean.marc.xml and PREFIX.exception.marc.xml + --marcfile -m Input filename. Defaults to PREFIX.marc.xml + --renumber-from -rf Begin renumbering id sequence with this number --renumber-tag -rt Tag to use in renumbering (default: 903) --renumber-subfield -rs Subfield code to use in renumbering (default: a) @@ -632,14 +706,18 @@ Options and renumbering is in effect, an old-to-new mapping file (old2new.map) will be generated. - --autoscrub -a Automatically remove non-numeric tags in data - --nocollapse -n Don't compress records to one line on output - --no-strip9 Don't autoremove 901/903 tags in data - --trashfile -t File containing trash tag data (see --trashhelp) + --force-utf8 Rewrite each record so that they describe themselves as + UTF-8 encoded + --autoscrub -a Automatically remove non-numeric tags in data + --fix-subfield -fs Subfield code to use in place of non-alphanumeric + or empty subfield codes + --nocollapse -n Don't compress records to one line on output + --no-strip9 Don't autoremove 901/903 tags in data + --trashfile -t File containing trash tag data (see --trashhelp) + --fullauto No manual edits. All problematic records dumped to + exception file. - --script Store human-initiated ops in scriptfile (.mcscript) - Not yet implemented HELP exit; }