X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=marc_cleanup;h=e0c20a781afd6f13c10402f017df98f913e9e8cf;hp=f40babb82cedbe616f11141ef1010cffa082e0c3;hb=60596cd805e898262344ba67da933cbc31e052e9;hpb=9edd65e09a65fb042313081490a79f7e5ef33406
diff --git a/marc_cleanup b/marc_cleanup
index f40babb..e0c20a7 100755
--- a/marc_cleanup
+++ b/marc_cleanup
@@ -1,34 +1,58 @@
#!/usr/bin/perl
+# Copyright 2009-2012, Equinox Software, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+require 5.10.0;
+
use strict;
use warnings;
use Getopt::Long;
use Term::ReadLine;
+use Equinox::Migration::SimpleTagList;
my $term = new Term::ReadLine 'yaz-cleanup';
my $OUT = $term->OUT || \*STDOUT;
+binmode STDOUT, ":utf8";
+binmode $OUT, ":utf8";
$| = 1;
# initialization and setup
-my $conf = {};
-initialize($conf);
-populate_trash() if ($conf->{trashfile});
+my $c = {};
+initialize($c);
# set up files, since everything appears to be in order
-open MARC, '<:utf8', (shift || 'incoming.marc.xml')
+open MARC, '<:utf8', $c->{marcfile}
or die "Can't open input file $!\n";
-open my $NUMARC, '>:utf8', $conf->{output}
+open my $NUMARC, '>:utf8', $c->{output}
or die "Can't open output file $!\n";
open my $OLD2NEW, '>', 'old2new.map'
- if ($conf->{'renumber-from'} and $conf->{'original-subfield'});
+ if ($c->{'renumber-from'} and $c->{'original-tag'});
my $EXMARC = 'EX';
+print $NUMARC "\n";
+$c->{totalrecs} = `grep -c '{marcfile}`;
+chomp $c->{totalrecs};
+$c->{percent} = 0;
-my @record = (); # current record storage
-my %recmeta = (); # metadata about current record
-my $ptr = 0; # record index pointer
+my @record; # current record storage
+my %recmeta; # metadata about current record
+my $ptr = 0; # record index pointer
# this is the dispatch table which drives command selection in
# edit(), below
@@ -52,19 +76,25 @@ my %commands = ( c => \&print_fullcontext,
help => \&help,
);
-my @spinner = qw(- / | \\);
+my @spinner = qw(- \\ | /);
my $sidx = 0;
while ( buildrecord() ) {
- unless ($conf->{ricount} % 100) {
- print "\rWorking... ", $spinner[$sidx];
+ unless ($c->{ricount} % 50) {
+ $c->{percent} = int(($c->{ricount} / $c->{totalrecs}) * 100);
+ print "\rWorking (",$c->{percent},"%) ", $spinner[$sidx];
$sidx = ($sidx == $#spinner) ? 0 : $sidx + 1;
}
- do_automated_cleanups();
+ my $rc = do_automated_cleanups();
+ next if $rc;
$ptr = 0;
until ($ptr == $#record) {
+ # get datafield/tag data if we have it
+ $rc = stow_record_data() if ($c->{'renumber-from'} and $c->{'original-tag'});
+ next if $rc;
+
# naked ampersands
if ($record[$ptr] =~ /&/ && $record[$ptr] !~ /&\w+?;/)
{ edit("Naked ampersand"); $ptr= 0; next }
@@ -73,33 +103,30 @@ while ( buildrecord() ) {
my $match = $1;
# tags must be numeric
if ($match =~ /\D/) {
- edit("Non-numerics in tag") unless $conf->{autoscrub};
+ edit("Non-numerics in tag") unless $c->{autoscrub};
next;
}
- # test for existing 901/903 unless we're autocleaning them
- unless ($conf->{'strip-nines'}) {
- if ($match == 901 or $match == 903) {
- edit("Incoming 901/903 found in data");
- next;
- }
- }
}
# subfields can't be non-alphanumeric
if ($record[$ptr] =~ /\n";
-print $OUT "\nDone. \n";
+print $NUMARC "\n";
+print $OUT "\nDone. ",$c->{ricount}," in; ",$c->{rocount}," dumped \n";
#-----------------------------------------------------------------------------------
@@ -109,8 +136,6 @@ print $OUT "\nDone. \n";
sub do_automated_cleanups {
$ptr = 0;
until ($ptr == $#record) {
- # get datafield/tag data if we have it
- stow_record_data();
# catch empty datafield elements
if ($record[$ptr] =~ m/|) {
+ $record[$ptr] =~ s|\s{10,}||;
+ message("Trailing whitespace trimmed from subfield contents");
+ }
# automatable subfield maladies
$record[$ptr] =~ s/code=" ">c/code="c">/;
- $record[$ptr] =~ s/code=" ">\$/code="c"$>/;
+ $record[$ptr] =~ s/code=" ">\$/code="c">\$/;
}
+ return 0;
}
sub stow_record_data {
# get tag data if we're looking at it
-
- if ($record[$ptr] =~ m/.)"/;
- $recmeta{ind1} = $+{IND1} || '';
- $record[$ptr] =~ m/ind2="(?.)"/;
- $recmeta{ind2} = $+{IND2} || '';
-
- unless (defined $recmeta{tag}) {
+ my $tag = 0;
+ if ($record[$ptr] =~ m/<(?:control|data)field tag="(.{3})"/) {
+ $recmeta{tag} = $1;
+ $tag = $recmeta{tag};
+ $record[$ptr] =~ m/ind1="(.)"/;
+ $recmeta{ind1} = $1 || '';
+ $record[$ptr] =~ m/ind2="(.)"/;
+ $recmeta{ind2} = $1 || '';
+
+ unless ($tag) {
message("Autokill record: no detectable tag");
dump_record("No detectable tag") ;
+ return 1;
}
# and since we are looking at a tag, see if it's the original id
- if ($conf->{'original-subfield'} and
- $recmeta{tag} == $conf->{'original-tag'}) {
- my $line = $record[$ptr]; my $lptr = $ptr;
- my $osub = $conf->{'original-subfield'};
- $recmeta{oid} = 'NONE';
-
- until ($line =~ m||) {
- $lptr++;
- $line = $record[$lptr];
- $recmeta{oid} = $+{TAG}
- if ($line =~ /(.+?));
+ if ($tag == $c->{'original-tag'}) {
+ my $oid = 0;
+ if ($tag < 10) {
+ # controlfield
+ if ($record[$ptr] =~ m|(.+?)|)
+ { $oid = $1; print $OLD2NEW "$oid\t", $recmeta{nid}, "\n" }
+ } elsif ($tag >= 10 and $c->{'original-subfield'}) {
+ # datafield
+ my $line = $record[$ptr]; my $lptr = $ptr;
+ my $osub = $c->{'original-subfield'};
+ # skim to end of this tag
+ until ($line =~ m||) {
+ if ($line =~ /(.+?))
+ { $oid = $1; print $OLD2NEW "$oid\t", $recmeta{nid}, "\n" }
+ $lptr++;
+ $line = $record[$lptr];
+ }
+ } else {
+ return 0;
+ }
+
+ # didn't find the old id!
+ unless ($oid) {
+ message("Autokill record: no oldid when old2new mapping requested");
+ dump_record("No old id found");
+ return 1;
}
+
+ # got it; write to old->new map file
+ if ($c->{'renumber-from'} and $c->{'original-subfield'}) {
+ }
+
}
}
+ return 0;
}
#-----------------------------------------------------------------------------------
@@ -223,8 +274,11 @@ Handles the Term::ReadLine loop
sub edit {
my ($msg) = @_;
- return if $conf->{trash}{ $recmeta{tag} };
- $conf->{editmsg} = $msg;
+ return if $c->{trash}->has( $recmeta{tag} );
+ if ( $c->{fullauto} )
+ { dump_record($msg); return }
+
+ $c->{editmsg} = $msg;
print_fullcontext();
# stow original problem line
@@ -265,85 +319,80 @@ to the driver loop.
sub buildrecord {
my $l = '';
- $l = while (defined $l and $l !~ //);
+ my $istrash = 0;
+ my $trash = $c->{trash};
+
+ $l = while (defined $l and $l !~ /{ricount}++;
+ $c->{ricount}++;
+
+ for (keys %recmeta) { $recmeta{$_} = undef }
+ for (0 .. @record) { delete $record[$_] }
+
+ my $i = 0;
+ until ($l =~ m||) {
+ # clean up tags with spaces in them
+ $l =~ s/tag=" /tag="00/g;
+ $l =~ s/tag=" /tag="0/g;
+ $l =~ s/tag="-/tag="0/g;
+ $l =~ s/tag="(\d\d) /tag="0$1/g;
+
+ # excise unwanted tags
+ if ($istrash) {
+ $istrash = 0 if ($l =~ m|;
+ next;
+ }
+ if ($l =~ m/has($1) or ($c->{autoscrub} and $1 =~ /\D/))
+ { $istrash = 1; next }
+ }
+
+ push @record, $l;
+ $l = ;
+ $i++;
+ }
+
+ # add 903(?) with new record id
+ if ($c->{'renumber-from'}) {
+ $recmeta{nid} = $c->{'renumber-from'};
+ push @record, join('', ' ',
+ $recmeta{nid},
+ "\n");
+ $c->{'renumber-from'}++;
+ }
+ $i++;
- until ($l =~ m||)
- { push @record, $l; $l = ; }
push @record, $l;
return 1;
}
sub write_record {
my ($FH) = @_;
- my $trash = $conf->{trash};
if ($FH eq 'EX') {
$EXMARC = undef;
- open $EXMARC, '>:utf8', $conf->{exception}
+ open $EXMARC, '>:utf8', $c->{exception}
or die "Can't open exception file $!\n";
$FH = $EXMARC;
}
- $conf->{rocount}++ if ($FH eq $NUMARC);
print $FH '\n"
if(defined $recmeta{explanation});
- # excise unwanted tags
- if (keys %{$trash} or $conf->{autoscrub}) {
- my @trimmed = ();
- my $istrash = 0;
- for my $line (@record) {
- if ($istrash) {
- $istrash = 0 if $line =~ m|{$tag} or ($conf->{autoscrub} and $tag =~ /\D/)) {
- $istrash = 1;
- next
- }
- }
- push @trimmed, $line;
- }
- @record = @trimmed;
- }
-
- # add 903(?) with new record id
- my $renumber = '';
- if ($conf->{'renumber-from'}) {
- $recmeta{nid} = $conf->{'renumber-from'};
- $renumber = join('', ' ', $recmeta{nid}, "\n");
- my @tmp = @record[0 .. $#record - 1];
- my $last = $record[$#record];
- @record = (@tmp, $renumber, $last);
- $conf->{'renumber-from'}++;
- }
-
# scrub newlines (unless told not to or writing exception record)
- unless ($conf->{nocollapse} or $FH eq $EXMARC)
+ unless ($c->{nocollapse} or $FH eq $EXMARC)
{ s/\n// for (@record) }
- # write to old->new map file if needed
- if ($conf->{'renumber-from'} and $conf->{'original-subfield'}) {
- unless (defined $recmeta{oid}) {
- my $msg = join(' ', "No old id num found");
- dump_record($msg);
- } else {
- print $OLD2NEW $recmeta{oid}, "\t", $recmeta{nid}, "\n"
- }
- }
-
# actually write the record
print $FH @record,"\n";
+ # increment output record count (if not exception)
+ $c->{rocount}++ if ($FH eq $EXMARC);
+
# if we were dumping to exception file, nuke the record and set ptr
# to terminate processing loop
@record = ('a');
@@ -352,17 +401,17 @@ sub write_record {
sub print_fullcontext {
print $OUT "\r", ' ' x 72, "\n";
- print $OUT $conf->{editmsg},"\n";
+ print $OUT $c->{editmsg},"\n";
print $OUT "\r Tag:",$recmeta{tag}, " Ind1:'",
$recmeta{ind1},"' Ind2:'", $recmeta{ind2}, "'";
- print $OUT " @ ", $conf->{ricount}, "/", $conf->{rocount} + 1;
+ print $OUT " @ ", $c->{ricount}, "/", $c->{totalrecs};
print_context();
return 0;
}
sub print_context {
- my $upper = int($conf->{window} / 2) + 1;
- my $lower = int($conf->{window} / 2) - 1;
+ my $upper = int($c->{window} / 2) + 1;
+ my $lower = int($c->{window} / 2) - 1;
my $start = ($ptr - $upper < 0) ? 0 : $ptr - $upper;
my $stop = ($ptr + $lower > $#record) ? $#record : $ptr + $lower;
print $OUT "\n";
@@ -375,7 +424,7 @@ sub print_context {
sub message {
my ($msg) = @_;
- print $OUT "\r$msg at ",$conf->{ricount},"/",$conf->{rocount} + 1,"\n";
+ print $OUT "\r$msg at ",$c->{ricount},"/",$c->{totalrecs}, "\n";
}
#-----------------------------------------------------------------------------------
@@ -406,6 +455,7 @@ sub substitute {
$recmeta{prevline} = $record[$ptr];
$record[$ptr] =~ s/$from/$to/;
+ $ofrom = undef; $to = undef; $from = undef;
print_context();
return 0;
}
@@ -436,6 +486,7 @@ sub flip_line {
my $temp = $record[$ptr];
$record[$ptr] = $recmeta{prevline};
$recmeta{prevline} = $temp;
+ $temp = undef;
print_context();
return 0;
}
@@ -445,6 +496,7 @@ sub kill_line {
my @a = @record[0 .. $ptr - 1];
my @b = @record[$ptr + 1 .. $#record];
@record = (@a, @b);
+ @a = undef; @b = undef;
print_context();
return 0;
}
@@ -454,7 +506,8 @@ sub yank_line {
{ print $OUT "No killed line to yank\n"; return }
my @a = @record[0 .. $ptr - 1];
my @b = @record[$ptr .. $#record];
- @record = (@a, $conf->{killline}, @b);
+ @record = (@a, $c->{killline}, @b);
+ @a = undef; @b = undef;
print_context();
return 0;
}
@@ -475,8 +528,8 @@ sub display_lines {
sub dump_record {
my (@explanation) = @_;
- print $OUT @explanation;
- $recmeta{explanation} = join(' ', 'Tag', $recmeta{tag}, @explanation);
+ $recmeta{explanation} = join(' ', 'DUMPING RECORD: Tag', $recmeta{tag}, @explanation);
+ message( $recmeta{explanation} );
write_record($EXMARC);
return 1;
}
@@ -496,16 +549,16 @@ sub prev_line {
sub commit_edit { return 1 }
sub widen_window {
- if ($conf->{window} == 15)
+ if ($c->{window} == 15)
{ print $OUT "Window can't be bigger than 15 lines\n"; return }
- $conf->{window} += 2;
+ $c->{window} += 2;
print_context;
}
sub narrow_window {
- if ($conf->{window} == 5)
+ if ($c->{window} == 5)
{ print $OUT "Window can't be smaller than 5 lines\n"; return }
- $conf->{window} -= 2;
+ $c->{window} -= 2;
print_context;
}
@@ -533,104 +586,6 @@ return 0;
sub quit { exit }
-#-----------------------------------------------------------------------------------
-# populate_trash
-#-----------------------------------------------------------------------------------
-# defined a domain-specific language for specifying MARC tags to be dropped from
-# records during processing. it is line oriented, and is specified as follows:
-#
-# each line may specify any number of tags to be included, either singly (\d{1,3})
-# or as a range (\d{1,3}\.\.\d{1,3}
-#
-# if a single number is given, it must be between '000' and '999', inclusive.
-#
-# ranges obey the previous rule, and also the first number of the range must be less
-# than the second number
-#
-# finally, any single range in a line may be followed by the keyword 'except'. every
-# number or range after 'except' is excluded from the range specified. all these
-# numbers must actually be within the range.
-#
-# specifying a tag twice is an error, to help prevent typos
-
-sub populate_trash {
- print $OUT ">>> TRASHTAGS FILE FOUND. LOADING TAGS TO BE STRIPPED FROM OUTPUT\n";
- open TRASH, '<', $conf->{trashfile}
- or die "Can't open trash tags file!\n";
- while () {
- my $lastwasrange = 0;
- my %lastrange = ( high => 0, low => 0);
- my $except = 0;
-
- my @chunks = split /\s+/;
- while (my $chunk = shift @chunks) {
-
- # single values
- if ($chunk =~ /^\d{1,3}$/) {
- trash_add($chunk, $except);
- $lastwasrange = 0;
- next;
- }
-
- # ranges
- if ($chunk =~ /^\d{1,3}\.\.\d{1,3}$/) {
- my ($low, $high) = trash_add_range($chunk, $except, \%lastrange);
- $lastwasrange = 1;
- %lastrange = (low => $low, high => $high)
- unless $except;
- next;
- }
-
- # 'except'
- if ($chunk eq 'except') {
- die "Keyword 'except' can only follow a range (line $.)\n"
- unless $lastwasrange;
- die "Keyword 'except' may only occur once per line (line $.)\n"
- if $except;
- $except = 1;
- next;
- }
-
- die "Unknown chunk $chunk in .trashtags file (line $.)\n";
- }
- }
-
- # remove original id sequence tag from trash hash if we know it
- trash_add($conf->{'original-tag'}, 1)
- if ($conf->{'original-tag'} and $conf->{trash}{ $conf->{'original-tag'} });
-}
-
-sub trash_add_range {
- my ($chunk, $except, $range) = @_;
- my ($low,$high) = split /\.\./, $chunk;
- die "Ranges must be 'low..high' ($low is greater than $high on line $.)\n"
- if ($low > $high);
- if ($except) {
- die "Exception ranges must be within last addition range (line $.)\n"
- if ($low < $range->{low} or $high > $range->{high});
- }
- for my $tag ($low..$high) {
- trash_add($tag, $except)
- }
- return $low, $high;
-}
-
-sub trash_add {
- my ($tag, $except) = @_;
- my $trash = $conf->{trash};
-
- die "Trash values must be valid tags (000-999)\n"
- unless ($tag >= 0 and $tag <= 999);
-
- if ($except) {
- delete $trash->{$tag};
- } else {
- die "Trash tag '$tag' specified twice (line $.)\n"
- if $trash->{$tag};
- $trash->{$tag} = 1;
- }
-}
-
#-----------------------------------------------------------------------
=head2 initialize
@@ -649,8 +604,10 @@ sub initialize {
my $rc = GetOptions( $c,
'autoscrub|a',
+ 'fullauto',
'exception|x=s',
'output|o=s',
+ 'marcfile|m=s',
'prefix|p=s',
'nocollapse|n',
'renumber-from|rf=i',
@@ -659,7 +616,7 @@ sub initialize {
'original-tag|ot=i',
'original-subfield|os=s',
'script',
- 'strip-nines',
+ 'no-strip9',
'trashfile|t=s',
'trashhelp',
'help|h',
@@ -669,35 +626,48 @@ sub initialize {
show_trashhelp() if ($c->{trashhelp});
# defaults
- if ($c->{prefix}) {
- $c->{output} = join('.',$c->{prefix},'marc','xml');
- $c->{exception} = join('.',$c->{prefix},'marc','ex');
- }
+ my $pfx = defined($c->{prefix}) ? $c->{prefix} : "bibs";
+ $c->{ricount} = 0;
+ $c->{rocount} = 0;
$c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'};
$c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'};
- $c->{window} = 5;
-
- # autotrash 901, 903 if strip-nines
- if ($c->{'strip-nines'}) {
- $c->{trash}{901} = 1;
- $c->{trash}{903} = 1;
+ $c->{window} = 9;
+ if ($c->{prefix}) {
+ $c->{output} = join('.',$c->{prefix},'clean','marc','xml')
+ unless $c->{output};
+ $c->{exception} = join('.',$c->{prefix},'exception','marc','xml')
+ unless $c->{exception};
+ $c->{marcfile} = $c->{prefix} . '.marc.xml'
+ unless $c->{marcfile};
}
+ show_help() unless ($c->{marcfile} and $c->{output});
- my @keys = keys %{$c};
- show_help() unless (@ARGV and @keys);
+ if ($c->{trashfile}) {
+ $c->{trash} = Equinox::Migration::SimpleTagList->new(file => $c->{trashfile})
+ } else {
+ $c->{trash} = Equinox::Migration::SimpleTagList->new;
+ }
+ # autotrash 901, 903 unless no strip-nines
+ unless ($c->{'no-strip9'}) {
+ $c->{trash}->add_tag(901);
+ $c->{trash}->add_tag(903);
+ }
+ # remove original id sequence tag from trash hash if we know it
+ $c->{trash}->remove_tag($c->{'original-tag'})
+ if ( $c->{'original-tag'} and $c->{trash}->has($c->{'original-tag'}) );
}
sub show_help {
print <
+Usage is: marc_cleanup [OPTIONS]
Options
--output -o Cleaned MARCXML output filename
--exception -x Exception (dumped records) MARCXML filename
or
- --prefix=> -p Shared prefix for output/exception files. Will
- produce PREFIX.marc.xml and PREFIX.ex.xml
+ --prefix= -p Shared prefix for output/exception files. Will produce
+ PREFIX.clean.marc.xml and PREFIX.exception.marc.xml
- --trashfile -t File containing trash tag data (see --trashhelp)
+ --marcfile -m Input filename. Defaults to PREFIX.marc.xml
--renumber-from -rf Begin renumbering id sequence with this number
--renumber-tag -rt Tag to use in renumbering (default: 903)
@@ -708,44 +678,25 @@ Options
and renumbering is in effect, an old-to-new mapping
file (old2new.map) will be generated.
- --nocollapse -n Don't compress records to one line on output
--autoscrub -a Automatically remove non-numeric tags in data
- --strip-nines Automatically remove any existing 901/903 tags in data
+ --nocollapse -n Don't compress records to one line on output
+ --no-strip9 Don't autoremove 901/903 tags in data
+ --trashfile -t File containing trash tag data (see --trashhelp)
+
+ --fullauto No manual edits. All problematic records dumped to
+ exception file.
- --script Store human-initiated ops in scriptfile (.mcscript)
- Not yet implemented
HELP
exit;
}
sub show_trashhelp {
print <