seed kmig-clean with mig-clean

[migration-tools.git] / marc_cleanup
diff --git a/marc_cleanup b/marc_cleanup

index 853de84..524109e 100755 (executable)
--- a/marc_cleanup
+++ b/marc_cleanup
@@ -1,4 +1,21 @@
 #!/usr/bin/perl
+
+# Copyright 2009-2012, Equinox Software, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+
 require 5.10.0;
 
 use strict;
@@ -6,30 +23,36 @@ use warnings;
 
 use Getopt::Long;
 use Term::ReadLine;
+use Equinox::Migration::SimpleTagList;
 
 my $term = new Term::ReadLine 'yaz-cleanup';
 my $OUT = $term->OUT || \*STDOUT;
+binmode STDOUT, ":utf8";
+binmode $OUT, ":utf8";
 
 $| = 1;
 
 # initialization and setup
-my $conf = {};
-initialize($conf);
-populate_trash() if ($conf->{trashfile});
+my $c = {};
+initialize($c);
 
 # set up files, since everything appears to be in order
-open MARC, '<:utf8', (shift || 'incoming.marc.xml')
+open MARC, '<:utf8', $c->{marcfile}
   or die "Can't open input file $!\n";
-open my $NUMARC, '>:utf8', $conf->{output}
+open my $NUMARC, '>:utf8', $c->{output}
   or die "Can't open output file $!\n";
 open my $OLD2NEW, '>', 'old2new.map'
-  if ($conf->{'renumber-from'} and $conf->{'original-subfield'});
+  if ($c->{'renumber-from'} and $c->{'original-tag'});
 my $EXMARC = 'EX';
-print $NUMARC "<collection>\n";
+print $NUMARC "<collection xmlns=\"http://www.loc.gov/MARC21/slim\">\n";
+
+$c->{totalrecs} = `grep -c '<record' $c->{marcfile}`;
+chomp $c->{totalrecs};
+$c->{percent}   = 0;
 
-my @record  = (); # current record storage
-my %recmeta = (); # metadata about current record
-my $ptr  = 0;  # record index pointer
+my @record;  # current record storage
+my %recmeta; # metadata about current record
+my $ptr = 0; # record index pointer
 
 # this is the dispatch table which drives command selection in
 # edit(), below
@@ -53,19 +76,25 @@ my %commands = ( c => \&print_fullcontext,
                  help => \&help,
                );
 
-my @spinner = qw(- / | \\);
+my @spinner = qw(- \\ | /);
 my $sidx = 0;
 
 while ( buildrecord() ) {
-    unless ($conf->{ricount} % 100) {
-        print "\rWorking... ", $spinner[$sidx];
+    unless ($c->{ricount} % 50) {
+        $c->{percent} = int(($c->{ricount} / $c->{totalrecs}) * 100);
+        print "\rWorking (",$c->{percent},"%) ", $spinner[$sidx];
         $sidx = ($sidx == $#spinner) ? 0 : $sidx + 1;
     }
 
-    do_automated_cleanups();
+    my $rc = do_automated_cleanups();
+    next if $rc;
 
     $ptr = 0;
     until ($ptr == $#record) {
+        # get datafield/tag data if we have it
+        $rc = stow_record_data() if ($c->{'renumber-from'} and $c->{'original-tag'});
+        next if $rc;
+
         # naked ampersands
         if ($record[$ptr] =~ /&/ && $record[$ptr] !~ /&\w+?;/)
           { edit("Naked ampersand"); $ptr= 0; next }
@@ -74,26 +103,19 @@ while ( buildrecord() ) {
             my $match = $1;
             # tags must be numeric
             if ($match =~ /\D/) {
-                edit("Non-numerics in tag") unless $conf->{autoscrub};
+                edit("Non-numerics in tag") unless $c->{autoscrub};
                 next;
             }
-            # test for existing 901/903 unless we're autocleaning them
-            unless ($conf->{'strip-nines'}) {
-                if ($match == 901 or $match == 903) {
-                    edit("Incoming 901/903 found in data");
-                    next;
-                }
-            }
         }
 
         # subfields can't be non-alphanumeric
         if ($record[$ptr] =~ /<subfield code="(.*?)"/) {
             if ($1 =~ /\P{IsAlnum}/ or $1 eq '') {
-                edit("Junk in subfield code/Null subfield code");
+                edit("Junk in subfield code/Null subfield code ($1)");
                 next;
             }
         }
-        # subfields can't be non-alphanumeric
+        # subfields can't be larger than 1 char (technically you could make the MARC format accomodate that:)
         if ($record[$ptr] =~ /<subfield code="(\w{2,})"/) {
             edit("Subfield code larger than 1 char");
             next;
@@ -104,7 +126,7 @@ while ( buildrecord() ) {
     write_record($NUMARC);
 }
 print $NUMARC "</collection>\n";
-print $OUT "\nDone.               \n";
+print $OUT "\nDone. ",$c->{ricount}," in; ",$c->{rocount}," dumped          \n";
 
 
 #-----------------------------------------------------------------------------------
@@ -114,8 +136,6 @@ print $OUT "\nDone.               \n";
 sub do_automated_cleanups {
     $ptr = 0;
     until ($ptr == $#record) {
-        # get datafield/tag data if we have it
-        stow_record_data();
 
         # catch empty datafield elements
         if ($record[$ptr] =~ m/<datafield tag="..."/) {
@@ -123,6 +143,7 @@ sub do_automated_cleanups {
                 my @a = @record[0 .. $ptr - 1];
                 my @b = @record[$ptr + 2 .. $#record];
                 @record = (@a, @b);
+                @a = undef; @b = undef;
                 message("Empty datafield scrubbed");
                 $ptr = 0;
                 next;
@@ -135,6 +156,7 @@ sub do_automated_cleanups {
                 my @a = @record[0 .. $ptr - 1];
                 my @b = @record[$ptr + 1 .. $#record];
                 @record = (@a, @b);
+                @a = undef; @b = undef;
                 message("Empty subfield scrubbed");
                 $ptr = 0;
                 next;
@@ -155,6 +177,11 @@ sub do_automated_cleanups {
                 message("Short leader padded");
             }
         }
+        if ($c->{'force-utf8'}) {
+            if ($record[$ptr] =~ m|<leader>(.........).(.+)</leader>|) {
+                $record[$ptr] = "<leader>$1a$2</leader>\n";
+            }
+        }
         if ($record[$ptr] =~ m|<controlfield tag="008">(.+?)</control|) {
             #pad short 008
             my $content = $1;
@@ -171,48 +198,93 @@ sub do_automated_cleanups {
             message("Dollar sign corrected");
         }
 
-        # clean up tags with spaces in them
-        $record[$ptr] =~ s/tag="  /tag="00/g;
-        $record[$ptr] =~ s/tag=" /tag="0/g;
-        $record[$ptr] =~ s/tag="-/tag="0/g;
-        $record[$ptr] =~ s/tag="(\d\d) /tag="0$1/g;
+        # excessive trailing whitespace in subfield contents
+        if ($record[$ptr] =~ m|\s{10,}</subfield>|) {
+            $record[$ptr] =~ s|\s{10,}</subfield>|</subfield>|;
+            message("Trailing whitespace trimmed from subfield contents");
+        }
 
         # automatable subfield maladies
-        $record[$ptr] =~ s/code=" ">c/code="c">/;
-        $record[$ptr] =~ s/code=" ">\$/code="c">\$/;
+        if ($record[$ptr] =~ /code=" ">c/) {
+            message('Fixing probable subfield c, scenario 1');
+            $record[$ptr] =~ s/code=" ">c/code="c">/;
+        }
+        if ($record[$ptr] =~ /code=" ">\$/) {
+            message('Fixing probable subfield c, scenario 2');
+            $record[$ptr] =~ s/code=" ">\$/code="c">\$/;
+        }
+
+        if ($c->{'fix-subfield'}) {
+            if ($record[$ptr] =~ /code="&amp;">/) {
+                message('Fixing &amp; for subfield code');
+                $record[$ptr] =~ s/code="&amp;">/code="$c->{'fix-subfield'}">/;
+            }
+            if ($record[$ptr] =~ /code="(.*?\P{IsAlnum}.*?)">/) {
+                message("Fixing non-alphanumeric subfield code: $1 -> " . $c->{'fix-subfield'});
+                $record[$ptr] =~ s/code=".*?\P{IsAlnum}.*?">/code="$c->{'fix-subfield'}">/;
+            }
+            if ($record[$ptr] =~ /code="">/) {
+                message('Fixing null subfield code');
+                $record[$ptr] =~ s/code="">/code="$c->{'fix-subfield'}">/;
+            }
+        }
     }
+    return 0;
 }
 
 sub stow_record_data {
     # get tag data if we're looking at it
-    
-    if ($record[$ptr] =~ m/<datafield tag="(?<TAG>.{3})"/) {
-        $recmeta{tag} = $+{TAG};
-        $record[$ptr] =~ m/ind1="(?<IND1>.)"/;
-        $recmeta{ind1} = $+{IND1} || '';
-        $record[$ptr] =~ m/ind2="(?<IND2>.)"/;
-        $recmeta{ind2} = $+{IND2} || '';
-        
-        unless (defined $recmeta{tag}) {
+    my $tag = 0;
+    if ($record[$ptr] =~ m/<(?:control|data)field tag="(.{3})"/) {
+        $recmeta{tag} = $1;
+        $tag = $recmeta{tag};
+        $record[$ptr] =~ m/ind1="(.)"/;
+        $recmeta{ind1} = $1 || '';
+        $record[$ptr] =~ m/ind2="(.)"/;
+        $recmeta{ind2} = $1 || '';
+
+        unless ($tag) {
             message("Autokill record: no detectable tag");
             dump_record("No detectable tag") ;
+            return 1;
         }
 
         # and since we are looking at a tag, see if it's the original id
-        if ($conf->{'original-subfield'} and
-            $recmeta{tag} == $conf->{'original-tag'}) {
-            my $line = $record[$ptr]; my $lptr = $ptr;
-            my $osub = $conf->{'original-subfield'};
-            $recmeta{oid} = 'NONE';
-
-            until ($line =~ m|</record>|) {
-                $lptr++;
-                $line = $record[$lptr];
-                $recmeta{oid} = $+{TAG}
-                  if ($line =~ /<subfield code="$osub">(.+?)</);
+        if ($tag == $c->{'original-tag'}) {
+            my $oid = 0;
+            if ($tag < 10) {
+                # controlfield
+                if ($record[$ptr] =~ m|<controlfield tag="$tag">(.+?)</controlfield>|)
+                      { $oid = $1; print $OLD2NEW "$oid\t", $recmeta{nid}, "\n" }
+            } elsif ($tag >= 10 and $c->{'original-subfield'}) {
+                # datafield
+                my $line = $record[$ptr]; my $lptr = $ptr;
+                my $osub = $c->{'original-subfield'};
+                # skim to end of this tag
+                until ($line =~ m|</datafield>|) {
+                    if ($line =~ /<subfield code="$osub">(.+?)</)
+                      { $oid = $1; print $OLD2NEW "$oid\t", $recmeta{nid}, "\n" }
+                    $lptr++;
+                    $line = $record[$lptr];
+                }
+            } else {
+                return 0;
+            }
+
+            # didn't find the old id!
+            unless ($oid) {
+                message("Autokill record: no oldid when old2new mapping requested");
+                dump_record("No old id found");
+                return 1;
+            }
+
+            # got it; write to old->new map file
+            if ($c->{'renumber-from'} and $c->{'original-subfield'}) {
             }
+
         }
     }
+    return 0;
 }
 
 #-----------------------------------------------------------------------------------
@@ -228,8 +300,11 @@ Handles the Term::ReadLine loop
 sub edit {
     my ($msg) = @_;
 
-    return if $conf->{trash}{ $recmeta{tag} };
-    $conf->{editmsg} = $msg;
+    return if $c->{trash}->has( $recmeta{tag} );
+    if ( $c->{fullauto} )
+    { dump_record($msg); return }
+
+    $c->{editmsg} = $msg;
     print_fullcontext();
 
     # stow original problem line
@@ -270,85 +345,80 @@ to the driver loop.
 
 sub buildrecord {
     my $l = '';
-    $l = <MARC> while (defined $l and $l !~ /<record>/);
+    my $istrash = 0;
+    my $trash = $c->{trash};
+
+    $l = <MARC> while (defined $l and $l !~ /<record/);
     return $l unless defined $l;
-    @record = ();
-    %recmeta = ();
-    $conf->{ricount}++;
+    $c->{ricount}++;
+
+    for (keys %recmeta) { $recmeta{$_} = undef }
+    for (0 .. @record)  { delete $record[$_] }
+
+    my $i = 0;
+    until ($l =~ m|</record>|) {
+        # clean up tags with spaces in them
+        $l =~ s/tag="  /tag="00/g;
+        $l =~ s/tag=" /tag="0/g;
+        $l =~ s/tag="-/tag="0/g;
+        $l =~ s/tag="(\d\d) /tag="0$1/g;
+
+        # excise unwanted tags
+        if ($istrash) {
+            $istrash = 0 if ($l =~ m|</datafield|);
+            $l = <MARC>;
+            next;
+        }
+        if ($l =~ m/<datafield tag="(.{3})"/) {
+            if ($trash->has($1) or ($c->{autoscrub} and $1 =~ /\D/))
+              { $istrash = 1; next }
+        }
+
+        push @record, $l;
+        $l = <MARC>;
+        $i++;
+    }
+
+    # add 903(?) with new record id
+    if ($c->{'renumber-from'}) {
+        $recmeta{nid} = $c->{'renumber-from'};
+        push @record, join('', ' <datafield tag="', $c->{'renumber-tag'},
+                           '" ind1=" " ind2=" "> <subfield code="',
+                           $c->{'renumber-subfield'},
+                           '">',
+                           $recmeta{nid},
+                           "</subfield></datafield>\n");
+        $c->{'renumber-from'}++;
+    }
+    $i++;
 
-    until ($l =~ m|</record>|) 
-      { push @record, $l; $l = <MARC>; }
     push @record, $l;
     return 1;
 }
 
 sub write_record {
     my ($FH) = @_;
-    my $trash = $conf->{trash};
 
     if ($FH eq 'EX') {
         $EXMARC = undef;
-        open $EXMARC, '>:utf8', $conf->{exception}
+        open $EXMARC, '>:utf8', $c->{exception}
           or die "Can't open exception file $!\n";
         $FH = $EXMARC;
     }
 
-    $conf->{rocount}++ if ($FH eq $NUMARC);
     print $FH '<!-- ', $recmeta{explanation}, " -->\n"
       if(defined $recmeta{explanation});
 
-    # excise unwanted tags
-    if (keys %{$trash} or $conf->{autoscrub}) {
-        my @trimmed = ();
-        my $istrash = 0;
-        for my $line (@record) {
-            if ($istrash) {
-                $istrash = 0 if $line =~ m|</datafield|;
-                next;
-            }
-            if ($line =~ m/<datafield tag="(.{3})"/) {
-                my $tag = $1;
-                if ($trash->{$tag} or ($conf->{autoscrub} and $tag =~ /\D/)) {
-                    $istrash = 1;
-                    next
-                }
-            }
-            push @trimmed, $line;
-        }
-        @record = @trimmed;
-    }
-
-    # add 903(?) with new record id
-    my $renumber = '';
-    if ($conf->{'renumber-from'}) {
-        $recmeta{nid} = $conf->{'renumber-from'};
-        $renumber = join('', ' <datafield tag="', $conf->{'renumber-tag'},
-                         '" ind1=" " ind2=" "> <subfield code="',
-                         $conf->{'renumber-subfield'},
-                         '">', $recmeta{nid}, "</subfield></datafield>\n");
-        my @tmp = @record[0 .. $#record - 1];
-        my $last = $record[$#record];
-        @record = (@tmp, $renumber, $last);
-        $conf->{'renumber-from'}++;
-    }
-
     # scrub newlines (unless told not to or writing exception record)
-    unless ($conf->{nocollapse} or $FH eq $EXMARC)
+    unless ($c->{nocollapse} or $FH eq $EXMARC)
       { s/\n// for (@record) }
 
-    # write to old->new map file if needed
-    if ($conf->{'renumber-from'} and $conf->{'original-subfield'}) {
-        unless (defined $recmeta{oid}) {
-            my $msg = join(' ', "No old id num found");
-            dump_record($msg);
-        } else {
-            print $OLD2NEW $recmeta{oid}, "\t", $recmeta{nid}, "\n"
-        }
-    }
-
     # actually write the record
     print $FH @record,"\n";
 
+    # increment output record count (if not exception)
+    $c->{rocount}++ if ($FH eq $EXMARC);
+
     # if we were dumping to exception file, nuke the record and set ptr
     # to terminate processing loop
     @record = ('a');
@@ -357,17 +427,17 @@ sub write_record {
 
 sub print_fullcontext {
     print $OUT "\r", ' ' x 72, "\n";
-    print $OUT $conf->{editmsg},"\n";
+    print $OUT $c->{editmsg},"\n";
     print $OUT "\r    Tag:",$recmeta{tag}, " Ind1:'",
       $recmeta{ind1},"' Ind2:'", $recmeta{ind2}, "'";
-    print $OUT " @ ", $conf->{ricount}, "/", $conf->{rocount} + 1;
+    print $OUT " @ ", $c->{ricount}, "/", $c->{totalrecs};
     print_context();
     return 0;
 }
 
 sub print_context {
-    my $upper = int($conf->{window} / 2) + 1;
-    my $lower = int($conf->{window} / 2) - 1;
+    my $upper = int($c->{window} / 2) + 1;
+    my $lower = int($c->{window} / 2) - 1;
     my $start = ($ptr - $upper < 0) ? 0 : $ptr - $upper;
     my $stop  = ($ptr + $lower > $#record) ? $#record : $ptr + $lower;
     print $OUT "\n";
@@ -380,7 +450,7 @@ sub print_context {
 
 sub message {
     my ($msg) = @_;
-    print $OUT "\r$msg at ",$conf->{ricount},"/",$conf->{rocount} + 1,"\n";
+    print $OUT "\r$msg at ",$c->{ricount},"/",$c->{totalrecs}, "\n";
 }
 
 #-----------------------------------------------------------------------------------
@@ -411,6 +481,7 @@ sub substitute {
 
     $recmeta{prevline} = $record[$ptr];
     $record[$ptr] =~ s/$from/$to/;
+    $ofrom = undef; $to = undef; $from = undef;
     print_context();
     return 0;
 }
@@ -441,6 +512,7 @@ sub flip_line {
     my $temp = $record[$ptr];
     $record[$ptr] = $recmeta{prevline};
     $recmeta{prevline} = $temp;
+    $temp = undef;
     print_context();
     return 0;
 }
@@ -450,6 +522,7 @@ sub kill_line {
     my @a = @record[0 .. $ptr - 1];
     my @b = @record[$ptr + 1 .. $#record];
     @record = (@a, @b);
+    @a = undef; @b = undef;
     print_context();
     return 0;
 }
@@ -459,7 +532,8 @@ sub yank_line {
       { print $OUT "No killed line to yank\n"; return }
     my @a = @record[0 .. $ptr - 1];
     my @b = @record[$ptr .. $#record];
-    @record = (@a, $conf->{killline}, @b);
+    @record = (@a, $c->{killline}, @b);
+    @a = undef; @b = undef;
     print_context();
     return 0;
 }
@@ -480,8 +554,8 @@ sub display_lines {
 
 sub dump_record {
     my (@explanation) = @_;
-    print $OUT @explanation;
-    $recmeta{explanation} = join(' ', 'Tag', $recmeta{tag}, @explanation);
+    $recmeta{explanation} = join(' ', 'DUMPING RECORD: Tag', $recmeta{tag}, @explanation);
+    message( $recmeta{explanation} );
     write_record($EXMARC);
     return 1;
 }
@@ -501,16 +575,16 @@ sub prev_line {
 sub commit_edit { return 1 }
 
 sub widen_window {
-    if ($conf->{window} == 15)
+    if ($c->{window} == 15)
       { print $OUT "Window can't be bigger than 15 lines\n"; return }
-    $conf->{window} += 2;
+    $c->{window} += 2;
     print_context;
 }
 
 sub narrow_window {
-    if ($conf->{window} == 5)
+    if ($c->{window} == 5)
       { print $OUT "Window can't be smaller than 5 lines\n"; return }
-    $conf->{window} -= 2;
+    $c->{window} -= 2;
     print_context;
 }
 
@@ -538,104 +612,6 @@ return 0;
 
 sub quit { exit }
 
-#-----------------------------------------------------------------------------------
-# populate_trash
-#-----------------------------------------------------------------------------------
-# defined a domain-specific language for specifying MARC tags to be dropped from
-# records during processing. it is line oriented, and is specified as follows:
-#
-# each line may specify any number of tags to be included, either singly (\d{1,3})
-# or as a range (\d{1,3}\.\.\d{1,3}
-#
-# if a single number is given, it must be between '000' and '999', inclusive.
-#
-# ranges obey the previous rule, and also the first number of the range must be less
-# than the second number
-#
-# finally, any single range in a line may be followed by the keyword 'except'. every
-# number or range after 'except' is excluded from the range specified. all these
-# numbers must actually be within the range.
-#
-# specifying a tag twice is an error, to help prevent typos
-
-sub populate_trash {
-    print $OUT ">>> TRASHTAGS FILE FOUND. LOADING TAGS TO BE STRIPPED FROM OUTPUT\n";
-    open TRASH, '<', $conf->{trashfile}
-      or die "Can't open trash tags file!\n";
-    while (<TRASH>) {
-        my $lastwasrange = 0;
-        my %lastrange = ( high => 0, low => 0);
-        my $except = 0;
-
-        my @chunks = split /\s+/;
-        while (my $chunk = shift @chunks) {
-
-            # single values
-            if ($chunk =~ /^\d{1,3}$/) {
-                trash_add($chunk, $except);
-                $lastwasrange = 0;
-                next;
-            }
-
-            # ranges
-            if ($chunk =~ /^\d{1,3}\.\.\d{1,3}$/) {
-                my ($low, $high) = trash_add_range($chunk, $except, \%lastrange);
-                $lastwasrange = 1;
-                %lastrange = (low => $low, high => $high)
-                  unless $except;
-                next;
-            }
-
-            # 'except'
-            if ($chunk eq 'except') {
-                die "Keyword 'except' can only follow a range (line $.)\n"
-                  unless $lastwasrange;
-                die "Keyword 'except' may only occur once per line (line $.)\n"
-                  if $except;
-                $except = 1;
-                next;
-            }
-
-            die "Unknown chunk $chunk in .trashtags file (line $.)\n";
-        }
-    }
-
-    # remove original id sequence tag from trash hash if we know it
-    trash_add($conf->{'original-tag'}, 1)
-      if ($conf->{'original-tag'} and $conf->{trash}{ $conf->{'original-tag'} });
-}
-
-sub trash_add_range {
-    my ($chunk, $except, $range) = @_;
-    my ($low,$high) = split /\.\./, $chunk;
-    die "Ranges must be 'low..high' ($low is greater than $high on line $.)\n"
-      if ($low > $high);
-    if ($except) {
-        die "Exception ranges must be within last addition range (line $.)\n"
-          if ($low < $range->{low} or $high > $range->{high});
-    }
-    for my $tag ($low..$high) {
-        trash_add($tag, $except)
-    }
-    return $low, $high;
-}
-
-sub trash_add {
-    my ($tag, $except) = @_;
-    my $trash = $conf->{trash};
-
-    die "Trash values must be valid tags (000-999)\n"
-      unless ($tag >= 0 and $tag <= 999);
-
-    if ($except) {
-        delete $trash->{$tag};
-    } else {
-        die "Trash tag '$tag' specified twice (line $.)\n"
-          if $trash->{$tag};
-        $trash->{$tag} = 1;
-    }
-}
-
 #-----------------------------------------------------------------------
 
 =head2 initialize
@@ -654,8 +630,10 @@ sub initialize {
 
     my $rc = GetOptions( $c,
                          'autoscrub|a',
+                         'fullauto',
                          'exception|x=s',
                          'output|o=s',
+                         'marcfile|m=s',
                          'prefix|p=s',
                          'nocollapse|n',
                          'renumber-from|rf=i',
@@ -663,8 +641,10 @@ sub initialize {
                          'renumber-subfield|rs=s',
                          'original-tag|ot=i',
                          'original-subfield|os=s',
+                         'fix-subfield|fs=s',
+                         'force-utf8',
                          'script',
-                         'strip-nines',
+                         'no-strip9',
                          'trashfile|t=s',
                          'trashhelp',
                          'help|h',
@@ -674,33 +654,62 @@ sub initialize {
     show_trashhelp() if ($c->{trashhelp});
 
     # defaults
-    my $pfx = $c->{prefix} // "bibs";
-    $c->{output} = join('.',$c->{prefix},'clean','marc','xml');
-    $c->{exception} = join('.',$c->{prefix},'exception','marc','xml');
+    my $pfx = defined($c->{prefix}) ? $c->{prefix} : "bibs";
+    $c->{ricount} = 0;
+    $c->{rocount} = 0;
     $c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'};
     $c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'};
-    $c->{window} = 5;
+    $c->{window} = 9;
+       if ($c->{marcfile} and $c->{prefix}) { abort('You can not declare a marc file and prefix.'); }
+    if ($c->{marcfile}) {
+        $c->{output} = join('.',$c->{marcfile},'clean')
+          unless $c->{output};
+        $c->{exception} = join('.',$c->{marcfile},'exception')
+          unless $c->{exception};
+    }
+    if ($c->{prefix}) {
+        $c->{output} = join('.',$c->{prefix},'clean','marc','xml')
+          unless $c->{output};
+        $c->{exception} = join('.',$c->{prefix},'exception','marc','xml')
+          unless $c->{exception};
+        $c->{marcfile} = $c->{prefix} . '.marc.xml'
+          unless $c->{marcfile};
+    }
+    show_help() unless ($c->{marcfile} and $c->{output});
 
-    # autotrash 901, 903 if strip-nines
-    if ($c->{'strip-nines'}) {
-        $c->{trash}{901} = 1;
-        $c->{trash}{903} = 1;
+    if ($c->{trashfile}) {
+        $c->{trash} = Equinox::Migration::SimpleTagList->new(file => $c->{trashfile})
+    } else {
+        $c->{trash} = Equinox::Migration::SimpleTagList->new;
     }
+    # autotrash 901, 903 unless no strip-nines
+    unless ($c->{'no-strip9'}) {
+        $c->{trash}->add_tag(901);
+        $c->{trash}->add_tag(903);
+    }
+    # remove original id sequence tag from trash hash if we know it
+    $c->{trash}->remove_tag($c->{'original-tag'})
+      if ( $c->{'original-tag'} and $c->{trash}->has($c->{'original-tag'}) );
+}
 
-    my @keys = keys %{$c};
-    show_help() unless (@ARGV and @keys);
+sub abort {
+    my $msg = shift;
+    print STDERR "$0: $msg", "\n";
+    exit 1;
 }
 
 sub show_help {
     print <<HELP;
-Usage is: marc-cleanup [OPTIONS] <filelist>
+Usage is: marc_cleanup [OPTIONS] <filelist>
 Options
   --output     -o  Cleaned MARCXML output filename
   --exception  -x  Exception (dumped records) MARCXML filename
        or
-  --prefix=<PREFIX>>   -p  Shared prefix for output/exception files. Will produce
+  --prefix=<PREFIX>    -p  Shared prefix for output/exception files. Will produce
                            PREFIX.clean.marc.xml and PREFIX.exception.marc.xml
 
+  --marcfile  -m  Input filename. Defaults to PREFIX.marc.xml
+
   --renumber-from     -rf  Begin renumbering id sequence with this number
   --renumber-tag      -rt  Tag to use in renumbering (default: 903)
   --renumber-subfield -rs  Subfield code to use in renumbering (default: a)
@@ -710,46 +719,29 @@ Options
                            and renumbering is in effect, an old-to-new mapping
                            file (old2new.map) will be generated.
 
-  --autoscrub  -a  Automatically remove non-numeric tags in data
-  --nocollapse -n  Don't compress records to one line on output
-  --strip-nines    Automatically remove any existing 901/903 tags in data
-  --trashfile  -t  File containing trash tag data (see --trashhelp)
+  --force-utf8             Rewrite each record so that they describe themselves as
+                           UTF-8 encoded
+  --autoscrub         -a   Automatically remove non-numeric tags in data
+  --fix-subfield      -fs  Subfield code to use in place of non-alphanumeric
+                           or empty subfield codes
+  --nocollapse        -n   Don't compress records to one line on output
+  --no-strip9              Don't autoremove 901/903 tags in data
+  --trashfile         -t   File containing trash tag data (see --trashhelp)
 
+  --fullauto               No manual edits. All problematic records dumped to
+                           exception file.
 
-  --script         Store human-initiated ops in scriptfile (.mcscript)
-                   Not yet implemented
 HELP
 exit;
 }
 
 sub show_trashhelp {
     print <<HELP;
-The marc-cleanup trash tags file is a simple plaintext file. It is a
-line oriented format. There are three basic tokens:
-
-  * The tag
-  * The tag range
-  * The "except" clause
-
-Any number of tags and/or tag ranges can appear on a single line. A
-tag cannot appear twice in the file, either alone or as part of a
-range. This is to prevent errors in the trash tag listing. Items do
-not have to be sorted within a line. These following lines are valid:
-
-  850 852 870..879 886 890 896..899
-  214 696..699 012
-
-Ranges must be ordered internally. That is, "870..879" is valid while
-"879..870" is not.
-
-Finally, there can be only one "except" clause on a line. It is
-composed of the word "except" followed by one or more tags or
-ranges. Except clauses must follow a range, and all tags within the
-clause must be within the range which the clause follows.
+See
 
-  900..997 except 935 950..959 987 994
+http://intra.lan.hq.esilibrary.com/dokuwiki/doku.php?id=migration:tag_files
 
-is a valid example.
+for tag file syntax information.
 HELP
 exit;
 }