added subs thresholding

[migration-tools.git] / marc_cleanup
diff --git a/marc_cleanup b/marc_cleanup

index 85bd059..61c715c 100755 (executable)
--- a/marc_cleanup
+++ b/marc_cleanup
@@ -6,17 +6,18 @@ use warnings;
 
 use Getopt::Long;
 use Term::ReadLine;
+use Equinox::Migration::SimpleTagList;
 
-binmode STDOUT, ":utf8";
 my $term = new Term::ReadLine 'yaz-cleanup';
 my $OUT = $term->OUT || \*STDOUT;
+binmode STDOUT, ":utf8";
+binmode $OUT, ":utf8";
 
 $| = 1;
 
 # initialization and setup
 my $conf = {};
 initialize($conf);
-populate_trash() if ($conf->{trashfile});
 
 # set up files, since everything appears to be in order
 my $marcfile = shift || 'incoming.marc.xml';
@@ -27,15 +28,15 @@ open my $NUMARC, '>:utf8', $conf->{output}
 open my $OLD2NEW, '>', 'old2new.map'
   if ($conf->{'renumber-from'} and $conf->{'original-subfield'});
 my $EXMARC = 'EX';
-print $NUMARC "<collection>\n";
+print $NUMARC "<collection xmlns=\"http://www.loc.gov/MARC21/slim\">\n";
 
 $conf->{totalrecs} = `grep -c '<record' $marcfile`;
 chomp $conf->{totalrecs};
 $conf->{percent}   = 0;
 
-my @record  = (); # current record storage
-my %recmeta = (); # metadata about current record
-my $ptr  = 0;  # record index pointer
+my @record;  # current record storage
+my %recmeta; # metadata about current record
+my $ptr = 0; # record index pointer
 
 # this is the dispatch table which drives command selection in
 # edit(), below
@@ -89,13 +90,6 @@ while ( buildrecord() ) {
                 edit("Non-numerics in tag") unless $conf->{autoscrub};
                 next;
             }
-            # test for existing 901/903 unless we're autocleaning them
-            unless ($conf->{'strip9'} or $conf->{'no-strip9'}) {
-                if ($match == 901 or $match == 903) {
-                    edit("Incoming 901/903 found in data");
-                    next;
-                }
-            }
         }
 
         # subfields can't be non-alphanumeric
@@ -116,7 +110,7 @@ while ( buildrecord() ) {
     write_record($NUMARC);
 }
 print $NUMARC "</collection>\n";
-print $OUT "\nDone.               \n";
+print $OUT "\nDone. ",$conf->{ricount}," in / ",$conf->{rocount}," out          \n";
 
 
 #-----------------------------------------------------------------------------------
@@ -126,12 +120,14 @@ print $OUT "\nDone.               \n";
 sub do_automated_cleanups {
     $ptr = 0;
     until ($ptr == $#record) {
+
         # catch empty datafield elements
         if ($record[$ptr] =~ m/<datafield tag="..."/) {
             if ($record[$ptr + 1] =~ m|</datafield>|) {
                 my @a = @record[0 .. $ptr - 1];
                 my @b = @record[$ptr + 2 .. $#record];
                 @record = (@a, @b);
+                @a = undef; @b = undef;
                 message("Empty datafield scrubbed");
                 $ptr = 0;
                 next;
@@ -144,6 +140,7 @@ sub do_automated_cleanups {
                 my @a = @record[0 .. $ptr - 1];
                 my @b = @record[$ptr + 1 .. $#record];
                 @record = (@a, @b);
+                @a = undef; @b = undef;
                 message("Empty subfield scrubbed");
                 $ptr = 0;
                 next;
@@ -180,12 +177,6 @@ sub do_automated_cleanups {
             message("Dollar sign corrected");
         }
 
-        # clean up tags with spaces in them
-        $record[$ptr] =~ s/tag="  /tag="00/g;
-        $record[$ptr] =~ s/tag=" /tag="0/g;
-        $record[$ptr] =~ s/tag="-/tag="0/g;
-        $record[$ptr] =~ s/tag="(\d\d) /tag="0$1/g;
-
         # automatable subfield maladies
         $record[$ptr] =~ s/code=" ">c/code="c">/;
         $record[$ptr] =~ s/code=" ">\$/code="c">\$/;
@@ -214,7 +205,8 @@ sub stow_record_data {
             my $osub = $conf->{'original-subfield'};
             $recmeta{oid} = 'NONE';
 
-            until ($line =~ m|</record>|) {
+            # skim to end of this tag
+            until ($line =~ m|</datafield>|) {
                 if ($line =~ /<subfield code="$osub">(.+?)</)
                   { $recmeta{oid} = $1 }
                 $lptr++;
@@ -243,7 +235,7 @@ Handles the Term::ReadLine loop
 sub edit {
     my ($msg) = @_;
 
-    return if $conf->{trash}{ $recmeta{tag} };
+    return if $conf->{trash}->has( $recmeta{tag} );
     $conf->{editmsg} = $msg;
     print_fullcontext();
 
@@ -285,21 +277,45 @@ to the driver loop.
 
 sub buildrecord {
     my $l = '';
+    my $istrash = 0;
+    my $trash = $conf->{trash};
+
     $l = <MARC> while (defined $l and $l !~ /<record>/);
     return $l unless defined $l;
-    @record = ();
-    %recmeta = ();
     $conf->{ricount}++;
 
-    until ($l =~ m|</record>|) 
-      { push @record, $l; $l = <MARC>; }
-    push @record, $l;
+    for (keys %recmeta) { $recmeta{$_} = undef }
+    for (0 .. @record)  { delete $record[$_] }
+
+    my $i = 0;
+    until ($l =~ m|</record>|) {
+        # clean up tags with spaces in them
+        $l =~ s/tag="  /tag="00/g;
+        $l =~ s/tag=" /tag="0/g;
+        $l =~ s/tag="-/tag="0/g;
+        $l =~ s/tag="(\d\d) /tag="0$1/g;
+
+        # excise unwanted tags
+        if ($istrash) {
+            $istrash = 0 if ($l =~ m|</datafield|);
+            $l = <MARC>;
+            next;
+        }
+        if ($l =~ m/<datafield tag="(.{3})"/) {
+            if ($trash->has($1) or ($conf->{autoscrub} and $1 =~ /\D/))
+              { $istrash = 1; next }
+        }
+
+        $record[$i] = $l;
+        $l = <MARC>;
+        $i++;
+    }
+    $record[$i] = $l;
     return 1;
 }
 
 sub write_record {
     my ($FH) = @_;
-    my $trash = $conf->{trash};
 
     if ($FH eq 'EX') {
         $EXMARC = undef;
@@ -312,27 +328,6 @@ sub write_record {
     print $FH '<!-- ', $recmeta{explanation}, " -->\n"
       if(defined $recmeta{explanation});
 
-    # excise unwanted tags
-    if (keys %{$trash} or $conf->{autoscrub}) {
-        my @trimmed = ();
-        my $istrash = 0;
-        for my $line (@record) {
-            if ($istrash) {
-                $istrash = 0 if $line =~ m|</datafield|;
-                next;
-            }
-            if ($line =~ m/<datafield tag="(.{3})"/) {
-                my $tag = $1;
-                if ($trash->{$tag} or ($conf->{autoscrub} and $tag =~ /\D/)) {
-                    $istrash = 1;
-                    next
-                }
-            }
-            push @trimmed, $line;
-        }
-        @record = @trimmed;
-    }
-
     # add 903(?) with new record id
     my $renumber = '';
     if ($conf->{'renumber-from'}) {
@@ -341,9 +336,11 @@ sub write_record {
                          '" ind1=" " ind2=" "> <subfield code="',
                          $conf->{'renumber-subfield'},
                          '">', $recmeta{nid}, "</subfield></datafield>\n");
-        my @tmp = @record[0 .. $#record - 1];
-        my $last = $record[$#record];
+        my @tmp = @record[0 .. @record - 2];
+        my $last = $record[-1];
+        @record = undef;
         @record = (@tmp, $renumber, $last);
+        @tmp = undef; $last = undef;
         $conf->{'renumber-from'}++;
     }
 
@@ -421,6 +418,7 @@ sub substitute {
 
     $recmeta{prevline} = $record[$ptr];
     $record[$ptr] =~ s/$from/$to/;
+    $ofrom = undef; $to = undef; $from = undef;
     print_context();
     return 0;
 }
@@ -451,6 +449,7 @@ sub flip_line {
     my $temp = $record[$ptr];
     $record[$ptr] = $recmeta{prevline};
     $recmeta{prevline} = $temp;
+    $temp = undef;
     print_context();
     return 0;
 }
@@ -460,6 +459,7 @@ sub kill_line {
     my @a = @record[0 .. $ptr - 1];
     my @b = @record[$ptr + 1 .. $#record];
     @record = (@a, @b);
+    @a = undef; @b = undef;
     print_context();
     return 0;
 }
@@ -470,6 +470,7 @@ sub yank_line {
     my @a = @record[0 .. $ptr - 1];
     my @b = @record[$ptr .. $#record];
     @record = (@a, $conf->{killline}, @b);
+    @a = undef; @b = undef;
     print_context();
     return 0;
 }
@@ -492,6 +493,7 @@ sub dump_record {
     my (@explanation) = @_;
     print $OUT @explanation;
     $recmeta{explanation} = join(' ', 'Tag', $recmeta{tag}, @explanation);
+    @explanation = undef;
     write_record($EXMARC);
     return 1;
 }
@@ -548,104 +550,6 @@ return 0;
 
 sub quit { exit }
 
-#-----------------------------------------------------------------------------------
-# populate_trash
-#-----------------------------------------------------------------------------------
-# defined a domain-specific language for specifying MARC tags to be dropped from
-# records during processing. it is line oriented, and is specified as follows:
-#
-# each line may specify any number of tags to be included, either singly (\d{1,3})
-# or as a range (\d{1,3}\.\.\d{1,3}
-#
-# if a single number is given, it must be between '000' and '999', inclusive.
-#
-# ranges obey the previous rule, and also the first number of the range must be less
-# than the second number
-#
-# finally, any single range in a line may be followed by the keyword 'except'. every
-# number or range after 'except' is excluded from the range specified. all these
-# numbers must actually be within the range.
-#
-# specifying a tag twice is an error, to help prevent typos
-
-sub populate_trash {
-    print $OUT ">>> TRASHTAGS FILE FOUND. LOADING TAGS TO BE STRIPPED FROM OUTPUT\n";
-    open TRASH, '<', $conf->{trashfile}
-      or die "Can't open trash tags file!\n";
-    while (<TRASH>) {
-        my $lastwasrange = 0;
-        my %lastrange = ( high => 0, low => 0);
-        my $except = 0;
-
-        my @chunks = split /\s+/;
-        while (my $chunk = shift @chunks) {
-
-            # single values
-            if ($chunk =~ /^\d{1,3}$/) {
-                trash_add($chunk, $except);
-                $lastwasrange = 0;
-                next;
-            }
-
-            # ranges
-            if ($chunk =~ /^\d{1,3}\.\.\d{1,3}$/) {
-                my ($low, $high) = trash_add_range($chunk, $except, \%lastrange);
-                $lastwasrange = 1;
-                %lastrange = (low => $low, high => $high)
-                  unless $except;
-                next;
-            }
-
-            # 'except'
-            if ($chunk eq 'except') {
-                die "Keyword 'except' can only follow a range (line $.)\n"
-                  unless $lastwasrange;
-                die "Keyword 'except' may only occur once per line (line $.)\n"
-                  if $except;
-                $except = 1;
-                next;
-            }
-
-            die "Unknown chunk $chunk in .trashtags file (line $.)\n";
-        }
-    }
-
-    # remove original id sequence tag from trash hash if we know it
-    trash_add($conf->{'original-tag'}, 1)
-      if ($conf->{'original-tag'} and $conf->{trash}{ $conf->{'original-tag'} });
-}
-
-sub trash_add_range {
-    my ($chunk, $except, $range) = @_;
-    my ($low,$high) = split /\.\./, $chunk;
-    die "Ranges must be 'low..high' ($low is greater than $high on line $.)\n"
-      if ($low > $high);
-    if ($except) {
-        die "Exception ranges must be within last addition range (line $.)\n"
-          if ($low < $range->{low} or $high > $range->{high});
-    }
-    for my $tag ($low..$high) {
-        trash_add($tag, $except)
-    }
-    return $low, $high;
-}
-
-sub trash_add {
-    my ($tag, $except) = @_;
-    my $trash = $conf->{trash};
-
-    die "Trash values must be valid tags (000-999)\n"
-      unless ($tag >= 0 and $tag <= 999);
-
-    if ($except) {
-        delete $trash->{$tag};
-    } else {
-        die "Trash tag '$tag' specified twice (line $.)\n"
-          if $trash->{$tag};
-        $trash->{$tag} = 1;
-    }
-}
-
 #-----------------------------------------------------------------------
 
 =head2 initialize
@@ -674,13 +578,12 @@ sub initialize {
                          'original-tag|ot=i',
                          'original-subfield|os=s',
                          'script',
-                         'strip9',
                          'no-strip9',
                          'trashfile|t=s',
                          'trashhelp',
                          'help|h',
                        );
-    show_help() unless $rc;
+    show_help() unless $rc and @ARGV;
     show_help() if ($c->{help});
     show_trashhelp() if ($c->{trashhelp});
 
@@ -690,13 +593,21 @@ sub initialize {
     $c->{exception} = join('.',$c->{prefix},'exception','marc','xml');
     $c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'};
     $c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'};
-    $c->{window} = 5;
+    $c->{window} = 9;
 
-    # autotrash 901, 903 if strip-nines
-    if ($c->{'strip9'}) {
-        $c->{trash}{901} = 1;
-        $c->{trash}{903} = 1;
+    if ($c->{trashfile}) {
+        $c->{trash} = Equinox::Migration::SimpleTagList->new(file => $conf->{trashfile})
+    } else {
+        $c->{trash} = Equinox::Migration::SimpleTagList->new;
     }
+    # autotrash 901, 903 unless no strip-nines
+    unless ($c->{'no-strip9'}) {
+        $c->{trash}->add_tag(901);
+        $c->{trash}->add_tag(903);
+    }
+    # remove original id sequence tag from trash hash if we know it
+    $c->{trash}->remove_tag($c->{'original-tag'})
+      if ( $c->{'original-tag'} and $c->{trash}->has($c->{'original-tag'}) );
 
     my @keys = keys %{$c};
     show_help() unless (@ARGV and @keys);
@@ -723,7 +634,7 @@ Options
 
   --autoscrub  -a  Automatically remove non-numeric tags in data
   --nocollapse -n  Don't compress records to one line on output
-  --strip9         Automatically remove any existing 901/903 tags in data (reversible)
+  --no-strip9      Don't autoremove 901/903 tags in data
   --trashfile  -t  File containing trash tag data (see --trashhelp)
 
 
@@ -735,32 +646,11 @@ exit;
 
 sub show_trashhelp {
     print <<HELP;
-The marc-cleanup trash tags file is a simple plaintext file. It is a
-line oriented format. There are three basic tokens:
-
-  * The tag
-  * The tag range
-  * The "except" clause
-
-Any number of tags and/or tag ranges can appear on a single line. A
-tag cannot appear twice in the file, either alone or as part of a
-range. This is to prevent errors in the trash tag listing. Items do
-not have to be sorted within a line. These following lines are valid:
-
-  850 852 870..879 886 890 896..899
-  214 696..699 012
-
-Ranges must be ordered internally. That is, "870..879" is valid while
-"879..870" is not.
-
-Finally, there can be only one "except" clause on a line. It is
-composed of the word "except" followed by one or more tags or
-ranges. Except clauses must follow a range, and all tags within the
-clause must be within the range which the clause follows.
+See
 
-  900..997 except 935 950..959 987 994
+http://intra.lan.hq.esilibrary.com/dokuwiki/doku.php?id=migration:tag_files
 
-is a valid example.
+for tag file syntax information.
 HELP
 exit;
 }