X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=marc_cleanup;h=9bc7b9665141243032517de37a724a6992fcc47b;hp=7d09f2738e656b287eb2d84b5e16c738400867f7;hb=602cb61a688e368aad047b1da9516e06ae76f858;hpb=4b9ee7a2265d1c7e9478f609e7715f1d49077350
diff --git a/marc_cleanup b/marc_cleanup
index 7d09f27..9bc7b96 100755
--- a/marc_cleanup
+++ b/marc_cleanup
@@ -6,7 +6,9 @@ use warnings;
use Getopt::Long;
use Term::ReadLine;
+use Equinox::Migration::SimpleTagList;
+binmode STDOUT, ":utf8";
my $term = new Term::ReadLine 'yaz-cleanup';
my $OUT = $term->OUT || \*STDOUT;
@@ -15,17 +17,21 @@ $| = 1;
# initialization and setup
my $conf = {};
initialize($conf);
-populate_trash() if ($conf->{trashfile});
# set up files, since everything appears to be in order
-open MARC, '<:utf8', (shift || 'incoming.marc.xml')
+my $marcfile = shift || 'incoming.marc.xml';
+open MARC, '<:utf8', $marcfile
or die "Can't open input file $!\n";
open my $NUMARC, '>:utf8', $conf->{output}
or die "Can't open output file $!\n";
open my $OLD2NEW, '>', 'old2new.map'
if ($conf->{'renumber-from'} and $conf->{'original-subfield'});
my $EXMARC = 'EX';
+print $NUMARC "\n";
+$conf->{totalrecs} = `grep -c '{totalrecs};
+$conf->{percent} = 0;
my @record = (); # current record storage
my %recmeta = (); # metadata about current record
@@ -53,19 +59,25 @@ my %commands = ( c => \&print_fullcontext,
help => \&help,
);
-my @spinner = qw(- / | \\);
+my @spinner = qw(- \\ | /);
my $sidx = 0;
while ( buildrecord() ) {
- unless ($conf->{ricount} % 100) {
- print "\rWorking... ", $spinner[$sidx];
+ unless ($conf->{ricount} % 50) {
+ $conf->{percent} = int(($conf->{ricount} / $conf->{totalrecs}) * 100);
+ print "\rWorking (",$conf->{percent},"%) ", $spinner[$sidx];
$sidx = ($sidx == $#spinner) ? 0 : $sidx + 1;
}
- do_automated_cleanups();
+ my $rc = do_automated_cleanups();
+ next if $rc;
$ptr = 0;
until ($ptr == $#record) {
+ # get datafield/tag data if we have it
+ my $rc = stow_record_data();
+ return $rc if $rc;
+
# naked ampersands
if ($record[$ptr] =~ /&/ && $record[$ptr] !~ /&\w+?;/)
{ edit("Naked ampersand"); $ptr= 0; next }
@@ -78,7 +90,7 @@ while ( buildrecord() ) {
next;
}
# test for existing 901/903 unless we're autocleaning them
- unless ($conf->{'strip-nines'}) {
+ unless ($conf->{'strip9'} or $conf->{'no-strip9'}) {
if ($match == 901 or $match == 903) {
edit("Incoming 901/903 found in data");
next;
@@ -88,18 +100,22 @@ while ( buildrecord() ) {
# subfields can't be non-alphanumeric
if ($record[$ptr] =~ /\n";
+print $NUMARC "\n";
print $OUT "\nDone. \n";
@@ -110,9 +126,6 @@ print $OUT "\nDone. \n";
sub do_automated_cleanups {
$ptr = 0;
until ($ptr == $#record) {
- # get datafield/tag data if we have it
- stow_record_data();
-
# catch empty datafield elements
if ($record[$ptr] =~ m/|) {
@@ -177,38 +190,44 @@ sub do_automated_cleanups {
$record[$ptr] =~ s/code=" ">c/code="c">/;
$record[$ptr] =~ s/code=" ">\$/code="c">\$/;
}
+ return 0;
}
sub stow_record_data {
# get tag data if we're looking at it
-
if ($record[$ptr] =~ m/.)"/;
$recmeta{ind1} = $+{IND1} || '';
$record[$ptr] =~ m/ind2="(?.)"/;
$recmeta{ind2} = $+{IND2} || '';
-
+
unless (defined $recmeta{tag}) {
message("Autokill record: no detectable tag");
dump_record("No detectable tag") ;
+ return 1;
}
# and since we are looking at a tag, see if it's the original id
- if ($conf->{'original-subfield'} and
- $recmeta{tag} == $conf->{'original-tag'}) {
+ if ($conf->{'original-subfield'} and $recmeta{tag} == $conf->{'original-tag'}) {
my $line = $record[$ptr]; my $lptr = $ptr;
my $osub = $conf->{'original-subfield'};
$recmeta{oid} = 'NONE';
until ($line =~ m||) {
+ if ($line =~ /(.+?))
+ { $recmeta{oid} = $1 }
$lptr++;
$line = $record[$lptr];
- $recmeta{oid} = $+{TAG}
- if ($line =~ /(.+?));
+ }
+ unless (defined $recmeta{oid}) {
+ message("Autokill record: no oldid when old2new mapping requested");
+ dump_record("No old id found");
+ return 1;
}
}
}
+ return 0;
}
#-----------------------------------------------------------------------------------
@@ -224,7 +243,7 @@ Handles the Term::ReadLine loop
sub edit {
my ($msg) = @_;
- return if $conf->{trash}{ $recmeta{tag} };
+ return if $conf->{trash}->has( $recmeta{tag} );
$conf->{editmsg} = $msg;
print_fullcontext();
@@ -272,7 +291,7 @@ sub buildrecord {
%recmeta = ();
$conf->{ricount}++;
- until ($l =~ m||)
+ until ($l =~ m||)
{ push @record, $l; $l = ; }
push @record, $l;
return 1;
@@ -294,7 +313,7 @@ sub write_record {
if(defined $recmeta{explanation});
# excise unwanted tags
- if (keys %{$trash} or $conf->{autoscrub}) {
+ if (defined $trash or $conf->{autoscrub}) {
my @trimmed = ();
my $istrash = 0;
for my $line (@record) {
@@ -304,7 +323,7 @@ sub write_record {
}
if ($line =~ m/{$tag} or ($conf->{autoscrub} and $tag =~ /\D/)) {
+ if ($trash->has($tag) or ($conf->{autoscrub} and $tag =~ /\D/)) {
$istrash = 1;
next
}
@@ -334,12 +353,7 @@ sub write_record {
# write to old->new map file if needed
if ($conf->{'renumber-from'} and $conf->{'original-subfield'}) {
- unless (defined $recmeta{oid}) {
- my $msg = join(' ', "No old id num found");
- dump_record($msg);
- } else {
- print $OLD2NEW $recmeta{oid}, "\t", $recmeta{nid}, "\n"
- }
+ print $OLD2NEW $recmeta{oid}, "\t", $recmeta{nid}, "\n"
}
# actually write the record
@@ -534,104 +548,6 @@ return 0;
sub quit { exit }
-#-----------------------------------------------------------------------------------
-# populate_trash
-#-----------------------------------------------------------------------------------
-# defined a domain-specific language for specifying MARC tags to be dropped from
-# records during processing. it is line oriented, and is specified as follows:
-#
-# each line may specify any number of tags to be included, either singly (\d{1,3})
-# or as a range (\d{1,3}\.\.\d{1,3}
-#
-# if a single number is given, it must be between '000' and '999', inclusive.
-#
-# ranges obey the previous rule, and also the first number of the range must be less
-# than the second number
-#
-# finally, any single range in a line may be followed by the keyword 'except'. every
-# number or range after 'except' is excluded from the range specified. all these
-# numbers must actually be within the range.
-#
-# specifying a tag twice is an error, to help prevent typos
-
-sub populate_trash {
- print $OUT ">>> TRASHTAGS FILE FOUND. LOADING TAGS TO BE STRIPPED FROM OUTPUT\n";
- open TRASH, '<', $conf->{trashfile}
- or die "Can't open trash tags file!\n";
- while () {
- my $lastwasrange = 0;
- my %lastrange = ( high => 0, low => 0);
- my $except = 0;
-
- my @chunks = split /\s+/;
- while (my $chunk = shift @chunks) {
-
- # single values
- if ($chunk =~ /^\d{1,3}$/) {
- trash_add($chunk, $except);
- $lastwasrange = 0;
- next;
- }
-
- # ranges
- if ($chunk =~ /^\d{1,3}\.\.\d{1,3}$/) {
- my ($low, $high) = trash_add_range($chunk, $except, \%lastrange);
- $lastwasrange = 1;
- %lastrange = (low => $low, high => $high)
- unless $except;
- next;
- }
-
- # 'except'
- if ($chunk eq 'except') {
- die "Keyword 'except' can only follow a range (line $.)\n"
- unless $lastwasrange;
- die "Keyword 'except' may only occur once per line (line $.)\n"
- if $except;
- $except = 1;
- next;
- }
-
- die "Unknown chunk $chunk in .trashtags file (line $.)\n";
- }
- }
-
- # remove original id sequence tag from trash hash if we know it
- trash_add($conf->{'original-tag'}, 1)
- if ($conf->{'original-tag'} and $conf->{trash}{ $conf->{'original-tag'} });
-}
-
-sub trash_add_range {
- my ($chunk, $except, $range) = @_;
- my ($low,$high) = split /\.\./, $chunk;
- die "Ranges must be 'low..high' ($low is greater than $high on line $.)\n"
- if ($low > $high);
- if ($except) {
- die "Exception ranges must be within last addition range (line $.)\n"
- if ($low < $range->{low} or $high > $range->{high});
- }
- for my $tag ($low..$high) {
- trash_add($tag, $except)
- }
- return $low, $high;
-}
-
-sub trash_add {
- my ($tag, $except) = @_;
- my $trash = $conf->{trash};
-
- die "Trash values must be valid tags (000-999)\n"
- unless ($tag >= 0 and $tag <= 999);
-
- if ($except) {
- delete $trash->{$tag};
- } else {
- die "Trash tag '$tag' specified twice (line $.)\n"
- if $trash->{$tag};
- $trash->{$tag} = 1;
- }
-}
-
#-----------------------------------------------------------------------
=head2 initialize
@@ -660,7 +576,8 @@ sub initialize {
'original-tag|ot=i',
'original-subfield|os=s',
'script',
- 'strip-nines',
+ 'strip9',
+ 'no-strip9',
'trashfile|t=s',
'trashhelp',
'help|h',
@@ -677,10 +594,19 @@ sub initialize {
$c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'};
$c->{window} = 5;
+ if ($c->{trashfile}) {
+ $c->{trash} = Equinox::Migration::SimpleTagList->new($conf->{trashfile})
+ } else {
+ $c->{trash} = Equinox::Migration::SimpleTagList->new;
+ }
+ # remove original id sequence tag from trash hash if we know it
+ $c->{trash}->remove_tag($c->{'original-tag'})
+ if ( $c->{'original-tag'} and $c->{trash}->has($c->{'original-tag'}) );
+
# autotrash 901, 903 if strip-nines
- if ($c->{'strip-nines'}) {
- $c->{trash}{901} = 1;
- $c->{trash}{903} = 1;
+ if ($c->{'strip9'}) {
+ $c->{trash}->add_tag(901);
+ $c->{trash}->add_tag(903);
}
my @keys = keys %{$c};
@@ -708,7 +634,8 @@ Options
--autoscrub -a Automatically remove non-numeric tags in data
--nocollapse -n Don't compress records to one line on output
- --strip-nines Automatically remove any existing 901/903 tags in data
+ --strip9 Automatically remove any existing 901/903 tags in data
+ --no-strip9 Don't complain about 901/903 tags in data
--trashfile -t File containing trash tag data (see --trashhelp)
@@ -720,32 +647,11 @@ exit;
sub show_trashhelp {
print <