X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=marc_cleanup;h=58a68c163e0786ee2ef20ce154ce2de3f3b1e398;hp=7f228354632f69e576f3572502d2e666642e7775;hb=353d0b5b20f6ac7f7aea62c3fc195597deed594b;hpb=10808752247bab3066a1bdfb69926cd4b2928ff8 diff --git a/marc_cleanup b/marc_cleanup index 7f22835..58a68c1 100755 --- a/marc_cleanup +++ b/marc_cleanup @@ -1,4 +1,21 @@ #!/usr/bin/perl + +# Copyright 2009-2012, Equinox Software, Inc. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + require 5.10.0; use strict; @@ -16,23 +33,22 @@ binmode $OUT, ":utf8"; $| = 1; # initialization and setup -my $conf = {}; -initialize($conf); +my $c = {}; +initialize($c); # set up files, since everything appears to be in order -my $marcfile = shift || 'incoming.marc.xml'; -open MARC, '<:utf8', $marcfile +open MARC, '<:utf8', $c->{marcfile} or die "Can't open input file $!\n"; -open my $NUMARC, '>:utf8', $conf->{output} +open my $NUMARC, '>:utf8', $c->{output} or die "Can't open output file $!\n"; open my $OLD2NEW, '>', 'old2new.map' - if ($conf->{'renumber-from'} and $conf->{'original-tag'}); + if ($c->{'renumber-from'} and $c->{'original-tag'}); my $EXMARC = 'EX'; print $NUMARC "\n"; -$conf->{totalrecs} = `grep -c '{totalrecs}; -$conf->{percent} = 0; +$c->{totalrecs} = `grep -c '{marcfile}`; +chomp $c->{totalrecs}; +$c->{percent} = 0; my @record; # current record storage my %recmeta; # metadata about current record @@ -64,9 +80,9 @@ my @spinner = qw(- \\ | /); my $sidx = 0; while ( buildrecord() ) { - unless ($conf->{ricount} % 50) { - $conf->{percent} = int(($conf->{ricount} / $conf->{totalrecs}) * 100); - print "\rWorking (",$conf->{percent},"%) ", $spinner[$sidx]; + unless ($c->{ricount} % 50) { + $c->{percent} = int(($c->{ricount} / $c->{totalrecs}) * 100); + print "\rWorking (",$c->{percent},"%) ", $spinner[$sidx]; $sidx = ($sidx == $#spinner) ? 0 : $sidx + 1; } @@ -76,8 +92,8 @@ while ( buildrecord() ) { $ptr = 0; until ($ptr == $#record) { # get datafield/tag data if we have it - $rc = stow_record_data() if ($conf->{'renumber-from'} and $conf->{'original-tag'}); - return $rc if $rc; + $rc = stow_record_data() if ($c->{'renumber-from'} and $c->{'original-tag'}); + next if $rc; # naked ampersands if ($record[$ptr] =~ /&/ && $record[$ptr] !~ /&\w+?;/) @@ -87,7 +103,7 @@ while ( buildrecord() ) { my $match = $1; # tags must be numeric if ($match =~ /\D/) { - edit("Non-numerics in tag") unless $conf->{autoscrub}; + edit("Non-numerics in tag") unless $c->{autoscrub}; next; } } @@ -95,11 +111,11 @@ while ( buildrecord() ) { # subfields can't be non-alphanumeric if ($record[$ptr] =~ /\n"; -print $OUT "\nDone. ",$conf->{ricount}," in; ",$conf->{rocount}," dumped \n"; +print $OUT "\nDone. ",$c->{ricount}," in; ",$c->{rocount}," dumped \n"; #----------------------------------------------------------------------------------- @@ -161,6 +177,11 @@ sub do_automated_cleanups { message("Short leader padded"); } } + if ($c->{'force-utf8'}) { + if ($record[$ptr] =~ m|(.........).(.+)|) { + $record[$ptr] = "$1a$2\n"; + } + } if ($record[$ptr] =~ m|(.+?)|) { + $record[$ptr] =~ s|\s{10,}||; + message("Trailing whitespace trimmed from subfield contents"); + } + # automatable subfield maladies - $record[$ptr] =~ s/code=" ">c/code="c">/; - $record[$ptr] =~ s/code=" ">\$/code="c">\$/; + if ($record[$ptr] =~ /code=" ">c/) { + message('Fixing probable subfield c, scenario 1'); + $record[$ptr] =~ s/code=" ">c/code="c">/; + } + if ($record[$ptr] =~ /code=" ">\$/) { + message('Fixing probable subfield c, scenario 2'); + $record[$ptr] =~ s/code=" ">\$/code="c">\$/; + } + + if ($c->{'fix-subfield'}) { + if ($record[$ptr] =~ /code="&">/) { + message('Fixing & for subfield code'); + $record[$ptr] =~ s/code="&">/code="$c->{'fix-subfield'}">/; + } + if ($record[$ptr] =~ /code="(.*?\P{IsAlnum}.*?)">/) { + message("Fixing non-alphanumeric subfield code: $1 -> " . $c->{'fix-subfield'}); + $record[$ptr] =~ s/code=".*?\P{IsAlnum}.*?">/code="$c->{'fix-subfield'}">/; + } + if ($record[$ptr] =~ /code="">/) { + message('Fixing null subfield code'); + $record[$ptr] =~ s/code="">/code="$c->{'fix-subfield'}">/; + } + } } return 0; } @@ -187,13 +235,13 @@ sub do_automated_cleanups { sub stow_record_data { # get tag data if we're looking at it my $tag = 0; - if ($record[$ptr] =~ m/<(control|data)field tag="(?.{3})"/) { - $recmeta{tag} = $+{TAG}; + if ($record[$ptr] =~ m/<(?:control|data)field tag="(.{3})"/) { + $recmeta{tag} = $1; $tag = $recmeta{tag}; - $record[$ptr] =~ m/ind1="(?.)"/; - $recmeta{ind1} = $+{IND1} || ''; - $record[$ptr] =~ m/ind2="(?.)"/; - $recmeta{ind2} = $+{IND2} || ''; + $record[$ptr] =~ m/ind1="(.)"/; + $recmeta{ind1} = $1 || ''; + $record[$ptr] =~ m/ind2="(.)"/; + $recmeta{ind2} = $1 || ''; unless ($tag) { message("Autokill record: no detectable tag"); @@ -202,16 +250,16 @@ sub stow_record_data { } # and since we are looking at a tag, see if it's the original id - if ($tag == $conf->{'original-tag'}) { + if ($tag == $c->{'original-tag'}) { my $oid = 0; if ($tag < 10) { # controlfield if ($record[$ptr] =~ m|(.+?)|) { $oid = $1; print $OLD2NEW "$oid\t", $recmeta{nid}, "\n" } - } elsif ($tag >= 10 and $conf->{'original-subfield'}) { + } elsif ($tag >= 10 and $c->{'original-subfield'}) { # datafield my $line = $record[$ptr]; my $lptr = $ptr; - my $osub = $conf->{'original-subfield'}; + my $osub = $c->{'original-subfield'}; # skim to end of this tag until ($line =~ m||) { if ($line =~ /(.+?)new map file - if ($conf->{'renumber-from'} and $conf->{'original-subfield'}) { + if ($c->{'renumber-from'} and $c->{'original-subfield'}) { } } @@ -252,11 +300,11 @@ Handles the Term::ReadLine loop sub edit { my ($msg) = @_; - return if $conf->{trash}->has( $recmeta{tag} ); - if ( $conf->{fullauto} ) + return if $c->{trash}->has( $recmeta{tag} ); + if ( $c->{fullauto} ) { dump_record($msg); return } - $conf->{editmsg} = $msg; + $c->{editmsg} = $msg; print_fullcontext(); # stow original problem line @@ -298,11 +346,11 @@ to the driver loop. sub buildrecord { my $l = ''; my $istrash = 0; - my $trash = $conf->{trash}; + my $trash = $c->{trash}; - $l = while (defined $l and $l !~ //); + $l = while (defined $l and $l !~ /{ricount}++; + $c->{ricount}++; for (keys %recmeta) { $recmeta{$_} = undef } for (0 .. @record) { delete $record[$_] } @@ -322,7 +370,7 @@ sub buildrecord { next; } if ($l =~ m/has($1) or ($conf->{autoscrub} and $1 =~ /\D/)) + if ($trash->has($1) or ($c->{autoscrub} and $1 =~ /\D/)) { $istrash = 1; next } } @@ -332,15 +380,15 @@ sub buildrecord { } # add 903(?) with new record id - if ($conf->{'renumber-from'}) { - $recmeta{nid} = $conf->{'renumber-from'}; - push @record, join('', ' {'renumber-tag'}, '" ind1=" " ind2=" "> ', $recmeta{nid}, "\n"); - $conf->{'renumber-from'}++; + $c->{'renumber-from'}++; } $i++; @@ -353,7 +401,7 @@ sub write_record { if ($FH eq 'EX') { $EXMARC = undef; - open $EXMARC, '>:utf8', $conf->{exception} + open $EXMARC, '>:utf8', $c->{exception} or die "Can't open exception file $!\n"; $FH = $EXMARC; } @@ -362,14 +410,14 @@ sub write_record { if(defined $recmeta{explanation}); # scrub newlines (unless told not to or writing exception record) - unless ($conf->{nocollapse} or $FH eq $EXMARC) + unless ($c->{nocollapse} or $FH eq $EXMARC) { s/\n// for (@record) } # actually write the record print $FH @record,"\n"; # increment output record count (if not exception) - $conf->{rocount}++ if ($FH eq $EXMARC); + $c->{rocount}++ if ($FH eq $EXMARC); # if we were dumping to exception file, nuke the record and set ptr # to terminate processing loop @@ -379,17 +427,17 @@ sub write_record { sub print_fullcontext { print $OUT "\r", ' ' x 72, "\n"; - print $OUT $conf->{editmsg},"\n"; + print $OUT $c->{editmsg},"\n"; print $OUT "\r Tag:",$recmeta{tag}, " Ind1:'", $recmeta{ind1},"' Ind2:'", $recmeta{ind2}, "'"; - print $OUT " @ ", $conf->{ricount}, "/", $conf->{totalrecs}; + print $OUT " @ ", $c->{ricount}, "/", $c->{totalrecs}; print_context(); return 0; } sub print_context { - my $upper = int($conf->{window} / 2) + 1; - my $lower = int($conf->{window} / 2) - 1; + my $upper = int($c->{window} / 2) + 1; + my $lower = int($c->{window} / 2) - 1; my $start = ($ptr - $upper < 0) ? 0 : $ptr - $upper; my $stop = ($ptr + $lower > $#record) ? $#record : $ptr + $lower; print $OUT "\n"; @@ -402,7 +450,7 @@ sub print_context { sub message { my ($msg) = @_; - print $OUT "\r$msg at ",$conf->{ricount},"/",$conf->{totalrecs}, "\n"; + print $OUT "\r$msg at ",$c->{ricount},"/",$c->{totalrecs}, "\n"; } #----------------------------------------------------------------------------------- @@ -484,7 +532,7 @@ sub yank_line { { print $OUT "No killed line to yank\n"; return } my @a = @record[0 .. $ptr - 1]; my @b = @record[$ptr .. $#record]; - @record = (@a, $conf->{killline}, @b); + @record = (@a, $c->{killline}, @b); @a = undef; @b = undef; print_context(); return 0; @@ -527,16 +575,16 @@ sub prev_line { sub commit_edit { return 1 } sub widen_window { - if ($conf->{window} == 15) + if ($c->{window} == 15) { print $OUT "Window can't be bigger than 15 lines\n"; return } - $conf->{window} += 2; + $c->{window} += 2; print_context; } sub narrow_window { - if ($conf->{window} == 5) + if ($c->{window} == 5) { print $OUT "Window can't be smaller than 5 lines\n"; return } - $conf->{window} -= 2; + $c->{window} -= 2; print_context; } @@ -585,6 +633,7 @@ sub initialize { 'fullauto', 'exception|x=s', 'output|o=s', + 'marcfile|m=s', 'prefix|p=s', 'nocollapse|n', 'renumber-from|rf=i', @@ -592,28 +641,37 @@ sub initialize { 'renumber-subfield|rs=s', 'original-tag|ot=i', 'original-subfield|os=s', + 'fix-subfield|fs=s', + 'force-utf8', 'script', 'no-strip9', 'trashfile|t=s', 'trashhelp', 'help|h', ); - show_help() unless $rc and @ARGV; + show_help() unless $rc; show_help() if ($c->{help}); show_trashhelp() if ($c->{trashhelp}); # defaults - my $pfx = $c->{prefix} // "bibs"; + my $pfx = defined($c->{prefix}) ? $c->{prefix} : "bibs"; $c->{ricount} = 0; $c->{rocount} = 0; - $c->{output} = join('.',$c->{prefix},'clean','marc','xml'); - $c->{exception} = join('.',$c->{prefix},'exception','marc','xml'); $c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'}; $c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'}; $c->{window} = 9; + if ($c->{prefix}) { + $c->{output} = join('.',$c->{prefix},'clean','marc','xml') + unless $c->{output}; + $c->{exception} = join('.',$c->{prefix},'exception','marc','xml') + unless $c->{exception}; + $c->{marcfile} = $c->{prefix} . '.marc.xml' + unless $c->{marcfile}; + } + show_help() unless ($c->{marcfile} and $c->{output}); if ($c->{trashfile}) { - $c->{trash} = Equinox::Migration::SimpleTagList->new(file => $conf->{trashfile}) + $c->{trash} = Equinox::Migration::SimpleTagList->new(file => $c->{trashfile}) } else { $c->{trash} = Equinox::Migration::SimpleTagList->new; } @@ -625,21 +683,20 @@ sub initialize { # remove original id sequence tag from trash hash if we know it $c->{trash}->remove_tag($c->{'original-tag'}) if ( $c->{'original-tag'} and $c->{trash}->has($c->{'original-tag'}) ); - - my @keys = keys %{$c}; - show_help() unless (@ARGV and @keys); } sub show_help { print < +Usage is: marc_cleanup [OPTIONS] Options --output -o Cleaned MARCXML output filename --exception -x Exception (dumped records) MARCXML filename or - --prefix=> -p Shared prefix for output/exception files. Will produce + --prefix= -p Shared prefix for output/exception files. Will produce PREFIX.clean.marc.xml and PREFIX.exception.marc.xml + --marcfile -m Input filename. Defaults to PREFIX.marc.xml + --renumber-from -rf Begin renumbering id sequence with this number --renumber-tag -rt Tag to use in renumbering (default: 903) --renumber-subfield -rs Subfield code to use in renumbering (default: a) @@ -649,16 +706,18 @@ Options and renumbering is in effect, an old-to-new mapping file (old2new.map) will be generated. - --autoscrub -a Automatically remove non-numeric tags in data - --nocollapse -n Don't compress records to one line on output - --no-strip9 Don't autoremove 901/903 tags in data - --trashfile -t File containing trash tag data (see --trashhelp) + --force-utf8 Rewrite each record so that they describe themselves as + UTF-8 encoded + --autoscrub -a Automatically remove non-numeric tags in data + --fix-subfield -fs Subfield code to use in place of non-alphanumeric + or empty subfield codes + --nocollapse -n Don't compress records to one line on output + --no-strip9 Don't autoremove 901/903 tags in data + --trashfile -t File containing trash tag data (see --trashhelp) - --fullauto No manual edits. All problematic records dumped to - exception file. + --fullauto No manual edits. All problematic records dumped to + exception file. - --script Store human-initiated ops in scriptfile (.mcscript) - Not yet implemented HELP exit; }