marc_cleanup

   1 #!/usr/bin/perl
   2 require 5.10.0;
   3
   4 use strict;
   5 use warnings;
   6
   7 use Getopt::Long;
   8 use Term::ReadLine;
   9
  10 my $term = new Term::ReadLine 'yaz-cleanup';
  11 my $OUT = $term->OUT || \*STDOUT;
  12
  13 $| = 1;
  14
  15 # initialization and setup
  16 my $conf = {};
  17 initialize($conf);
  18 populate_trash() if ($conf->{trashfile});
  19
  20 # set up files, since everything appears to be in order
  21 open MARC, '<:utf8', (shift || 'incoming.marc.xml')
  22   or die "Can't open input file $!\n";
  23 open my $NUMARC, '>:utf8', $conf->{output}
  24   or die "Can't open output file $!\n";
  25 open my $OLD2NEW, '>', 'old2new.map'
  26   if ($conf->{'renumber-from'} and $conf->{'original-subfield'});
  27 my $EXMARC = 'EX';
  28
  29
  30 my @record  = (); # current record storage
  31 my %recmeta = (); # metadata about current record
  32 my $ptr  = 0;  # record index pointer
  33
  34 # this is the dispatch table which drives command selection in
  35 # edit(), below
  36 my %commands = ( c => \&print_fullcontext,
  37                  n => \&next_line,
  38                  p => \&prev_line,
  39                  '<' => \&widen_window,
  40                  '>' => \&narrow_window,
  41                  d => \&display_lines,
  42                  o => \&insert_original,
  43                  k => \&kill_line,
  44                  y => \&yank_line,
  45                  f => \&flip_line,
  46                  m => \&merge_lines,
  47                  s => \&substitute,
  48                  t => \&commit_edit,
  49                  x => \&dump_record,
  50                  q => \&quit,
  51                  '?' => \&help,
  52                  h   => \&help,
  53                  help => \&help,
  54                );
  55
  56 my @spinner = qw(- / | \\);
  57 my $sidx = 0;
  58
  59 while ( buildrecord() ) {
  60     unless ($conf->{ricount} % 100) {
  61         print "\rWorking... ", $spinner[$sidx];
  62         $sidx = ($sidx == $#spinner) ? 0 : $sidx + 1;
  63     }
  64
  65     do_automated_cleanups();
  66
  67     $ptr = 0;
  68     until ($ptr == $#record) {
  69         # naked ampersands
  70         if ($record[$ptr] =~ /&/ && $record[$ptr] !~ /&\w+?;/)
  71           { edit("Naked ampersand"); $ptr= 0; next }
  72
  73         if ($record[$ptr] =~ /<datafield tag="(.+?)"/) {
  74             my $match = $1;
  75             # tags must be numeric
  76             if ($match =~ /\D/) {
  77                 edit("Non-numerics in tag") unless $conf->{autoscrub};
  78                 next;
  79             }
  80             # test for existing 901/903 unless we're autocleaning them
  81             unless ($conf->{'strip-nines'}) {
  82                 if ($match == 901 or $match == 903) {
  83                     edit("Incoming 901/903 found in data");
  84                     next;
  85                 }
  86             }
  87         }
  88
  89         # subfields can't be non-alphanumeric
  90         if ($record[$ptr] =~ /<subfield code="(.*?)"/) {
  91             my $match = $1;
  92             if ($match =~ /\P{IsAlnum}/ or $match eq '') {
  93                 edit("Junk in subfield code/Null subfield code");
  94                 next;
  95             }
  96         }
  97
  98         $ptr++;
  99     }
 100     write_record($NUMARC);
 101 }
 102 #print $NUMARC "</collection>\n";
 103 print $OUT "\nDone.               \n";
 104
 105
 106 #-----------------------------------------------------------------------------------
 107 # cleanup routines
 108 #-----------------------------------------------------------------------------------
 109
 110 sub do_automated_cleanups {
 111     $ptr = 0;
 112     until ($ptr == $#record) {
 113         # get datafield/tag data if we have it
 114         stow_record_data();
 115
 116         # catch empty datafield elements
 117         if ($record[$ptr] =~ m/<datafield tag="..."/) {
 118             if ($record[$ptr + 1] =~ m|</datafield>|) {
 119                 my @a = @record[0 .. $ptr - 1];
 120                 my @b = @record[$ptr + 2 .. $#record];
 121                 @record = (@a, @b);
 122                 message("Empty datafield scrubbed");
 123                 $ptr = 0;
 124                 next;
 125             }
 126         }
 127         # and quasi-empty subfields
 128         if ($record[$ptr] =~ m|<subfield code="(.*?)">(.*?)</sub|) {
 129             my $code = $1; my $content = $2;
 130             if ($code =~ /\W/ and ($content =~ /\s+/ or $content eq '')) {
 131                 my @a = @record[0 .. $ptr - 1];
 132                 my @b = @record[$ptr + 1 .. $#record];
 133                 @record = (@a, @b);
 134                 message("Empty subfield scrubbed");
 135                 $ptr = 0;
 136                 next;
 137             }
 138         }
 139         $ptr++;
 140     }
 141
 142     # single-line fixes
 143     for $ptr (0 .. $#record) {
 144         # pad short leaders
 145         if ($record[$ptr] =~ m|<leader>(.+?)</leader>|) {
 146             my $leader = $1;
 147             if (length $leader < 24) {
 148                 $leader .= ' ' x (20 - length($leader));
 149                 $leader .= "4500";
 150                 $record[$ptr] = "<leader>$leader</leader>\n";
 151                 message("Short leader padded");
 152             }
 153         }
 154         if ($record[$ptr] =~ m|<controlfield tag="008">(.+?)</control|) {
 155             #pad short 008
 156             my $content = $1;
 157             if (length $content < 40) {
 158                 $content .= ' ' x (40 - length($content));
 159                 $record[$ptr] = "<controlfield tag=\"008\">$content</controlfield>\n";
 160                 message("Short 008 padded");
 161             }
 162         }
 163
 164         # clean misplaced dollarsigns
 165         if ($record[$ptr] =~ m|<subfield code="\$">c?\d+\.\d{2}|) {
 166             $record[$ptr] =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|;
 167             message("Dollar sign corrected");
 168         }
 169
 170         # clean up tags with spaces in them
 171         $record[$ptr] =~ s/tag="  /tag="00/g;
 172         $record[$ptr] =~ s/tag=" /tag="0/g;
 173         $record[$ptr] =~ s/tag="-/tag="0/g;
 174         $record[$ptr] =~ s/tag="(\d\d) /tag="0$1/g;
 175
 176         # automatable subfield maladies
 177         $record[$ptr] =~ s/code=" ">c/code="c">/;
 178         $record[$ptr] =~ s/code=" ">\$/code="c">\$/;
 179     }
 180 }
 181
 182 sub stow_record_data {
 183     # get tag data if we're looking at it
 184
 185     if ($record[$ptr] =~ m/<datafield tag="(?<TAG>.{3})"/) {
 186         $recmeta{tag} = $+{TAG};
 187         $record[$ptr] =~ m/ind1="(?<IND1>.)"/;
 188         $recmeta{ind1} = $+{IND1} || '';
 189         $record[$ptr] =~ m/ind2="(?<IND2>.)"/;
 190         $recmeta{ind2} = $+{IND2} || '';
 191
 192         unless (defined $recmeta{tag}) {
 193             message("Autokill record: no detectable tag");
 194             dump_record("No detectable tag") ;
 195         }
 196
 197         # and since we are looking at a tag, see if it's the original id
 198         if ($conf->{'original-subfield'} and
 199             $recmeta{tag} == $conf->{'original-tag'}) {
 200             my $line = $record[$ptr]; my $lptr = $ptr;
 201             my $osub = $conf->{'original-subfield'};
 202             $recmeta{oid} = 'NONE';
 203
 204             until ($line =~ m|</record>|) {
 205                 $lptr++;
 206                 $line = $record[$lptr];
 207                 $recmeta{oid} = $+{TAG}
 208                   if ($line =~ /<subfield code="$osub">(.+?)</);
 209             }
 210         }
 211     }
 212 }
 213
 214 #-----------------------------------------------------------------------------------
 215 # driver routines
 216 #-----------------------------------------------------------------------------------
 217
 218 =head2 edit
 219
 220 Handles the Term::ReadLine loop
 221
 222 =cut
 223
 224 sub edit {
 225     my ($msg) = @_;
 226
 227     return if $conf->{trash}{ $recmeta{tag} };
 228     $conf->{editmsg} = $msg;
 229     print_fullcontext();
 230
 231     # stow original problem line
 232     $recmeta{origline} = $record[$ptr];
 233
 234     while (1) {
 235         my $line = $term->readline('marc-cleanup>');
 236         my @chunks = split /\s+/, $line;
 237
 238         # lines with single-character first chunks are commands.
 239         # make sure they exist.
 240         if (length $chunks[0] == 1) {
 241             unless (defined $commands{$chunks[0]}) {
 242                 print $OUT "No such command '", $chunks[0], "'\n";
 243                 next;
 244             }
 245         }
 246
 247         if (defined $commands{$chunks[0]}) {
 248             my $term = $commands{$chunks[0]}->(@chunks[1..$#chunks]);
 249             last if $term;
 250         } else {
 251             $recmeta{prevline} = $record[$ptr];
 252             $record[$ptr] = "$line\n";
 253             print_context();
 254         }
 255     }
 256     # set pointer to top on the way out
 257     $ptr = 0;
 258 }
 259
 260 =head2 buildrecord
 261
 262 Constructs record arrays from the incoming MARC file and returns them
 263 to the driver loop.
 264
 265 =cut
 266
 267 sub buildrecord {
 268     my $l = '';
 269     $l = <MARC> while (defined $l and $l !~ /<record>/);
 270     return $l unless defined $l;
 271     @record = ();
 272     %recmeta = ();
 273     $conf->{ricount}++;
 274
 275     until ($l =~ m|</record>|)
 276       { push @record, $l; $l = <MARC>; }
 277     push @record, $l;
 278     return 1;
 279 }
 280
 281 sub write_record {
 282     my ($FH) = @_;
 283     my $trash = $conf->{trash};
 284
 285     if ($FH eq 'EX') {
 286         $EXMARC = undef;
 287         open $EXMARC, '>:utf8', $conf->{exception}
 288           or die "Can't open exception file $!\n";
 289         $FH = $EXMARC;
 290     }
 291
 292     $conf->{rocount}++ if ($FH eq $NUMARC);
 293     print $FH '<!-- ', $recmeta{explanation}, " -->\n"
 294       if(defined $recmeta{explanation});
 295
 296     # excise unwanted tags
 297     if (keys %{$trash} or $conf->{autoscrub}) {
 298         my @trimmed = ();
 299         my $istrash = 0;
 300         for my $line (@record) {
 301             if ($istrash) {
 302                 $istrash = 0 if $line =~ m|</datafield|;
 303                 next;
 304             }
 305             if ($line =~ m/<datafield tag="(.{3})"/) {
 306                 my $tag = $1;
 307                 if ($trash->{$tag} or ($conf->{autoscrub} and $tag =~ /\D/)) {
 308                     $istrash = 1;
 309                     next
 310                 }
 311             }
 312             push @trimmed, $line;
 313         }
 314         @record = @trimmed;
 315     }
 316
 317     # add 903(?) with new record id
 318     my $renumber = '';
 319     if ($conf->{'renumber-from'}) {
 320         $recmeta{nid} = $conf->{'renumber-from'};
 321         $renumber = join('', ' <datafield tag="', $conf->{'renumber-tag'},
 322                          '" ind1=" " ind2=" "> <subfield code="',
 323                          $conf->{'renumber-subfield'},
 324                          '">', $recmeta{nid}, "</subfield></datafield>\n");
 325         my @tmp = @record[0 .. $#record - 1];
 326         my $last = $record[$#record];
 327         @record = (@tmp, $renumber, $last);
 328         $conf->{'renumber-from'}++;
 329     }
 330
 331     # scrub newlines (unless told not to or writing exception record)
 332     unless ($conf->{nocollapse} or $FH eq $EXMARC)
 333       { s/\n// for (@record) }
 334
 335     # write to old->new map file if needed
 336     if ($conf->{'renumber-from'} and $conf->{'original-subfield'}) {
 337         unless (defined $recmeta{oid}) {
 338             my $msg = join(' ', "No old id num found");
 339             dump_record($msg);
 340         } else {
 341             print $OLD2NEW $recmeta{oid}, "\t", $recmeta{nid}, "\n"
 342         }
 343     }
 344
 345     # actually write the record
 346     print $FH @record,"\n";
 347
 348     # if we were dumping to exception file, nuke the record and set ptr
 349     # to terminate processing loop
 350     @record = ('a');
 351     $ptr = 0;
 352 }
 353
 354 sub print_fullcontext {
 355     print $OUT "\r", ' ' x 72, "\n";
 356     print $OUT $conf->{editmsg},"\n";
 357     print $OUT "\r    Tag:",$recmeta{tag}, " Ind1:'",
 358       $recmeta{ind1},"' Ind2:'", $recmeta{ind2}, "'";
 359     print $OUT " @ ", $conf->{ricount}, "/", $conf->{rocount} + 1;
 360     print_context();
 361     return 0;
 362 }
 363
 364 sub print_context {
 365     my $upper = int($conf->{window} / 2) + 1;
 366     my $lower = int($conf->{window} / 2) - 1;
 367     my $start = ($ptr - $upper < 0) ? 0 : $ptr - $upper;
 368     my $stop  = ($ptr + $lower > $#record) ? $#record : $ptr + $lower;
 369     print $OUT "\n";
 370     print $OUT '    |', $record[$_] for ($start .. $ptr - 1);
 371     print $OUT '==> |', $record[$ptr];
 372     print $OUT '    |', $record[$_] for ($ptr + 1 .. $stop);
 373     print $OUT "\n";
 374     return 0;
 375 }
 376
 377 sub message {
 378     my ($msg) = @_;
 379     print $OUT "\r$msg at ",$conf->{ricount},"/",$conf->{rocount} + 1,"\n";
 380 }
 381
 382 #-----------------------------------------------------------------------------------
 383 # command routines
 384 #-----------------------------------------------------------------------------------
 385
 386 sub substitute {
 387     my (@chunks) = @_;
 388
 389     my $ofrom = shift @chunks;
 390     if ($ofrom =~ /^'/) {
 391         until ($ofrom =~ /'$/ or !@chunks)
 392           { $ofrom .= join(' ','',shift @chunks) }
 393         $ofrom =~ s/^'//; $ofrom =~ s/'$//;
 394     }
 395     my $to = shift @chunks;
 396     if ($to =~ /^'/) {
 397         until ($to =~ /'$/ or !@chunks)
 398           { $to .= join(' ','',shift @chunks) }
 399         $to =~ s/^'//; $to =~ s/'$//;
 400     }
 401
 402     my $from = '';
 403     for my $char (split(//,$ofrom)) {
 404         $char = "\\" . $char if ($char =~ /\W/);
 405         $from = join('', $from, $char);
 406     }
 407
 408     $recmeta{prevline} = $record[$ptr];
 409     $record[$ptr] =~ s/$from/$to/;
 410     print_context();
 411     return 0;
 412 }
 413
 414 sub merge_lines {
 415     $recmeta{prevline} = $record[$ptr];
 416     # remove <subfield stuff; extract (probably wrong) subfield code
 417     $record[$ptr] =~ s/^\s*<subfield code="(.*?)">//;
 418     # and move to front of line
 419     $record[$ptr] = join(' ', $1 , $record[$ptr]);
 420     # tear off trailing subfield tag from preceeding line
 421     $record[$ptr - 1] =~ s|</subfield>\n||;
 422     # join current line onto preceeding line
 423     $record[$ptr - 1] = join('', $record[$ptr - 1], $record[$ptr]);
 424     # erase current line
 425     my @a = @record[0 .. $ptr - 1];
 426     my @b = @record[$ptr + 1 .. $#record];
 427     @record = (@a, @b);
 428     # move record pointer to previous line
 429     prev_line();
 430     print_context();
 431     return 0;
 432 }
 433
 434 sub flip_line {
 435     unless ($recmeta{prevline})
 436       { print $OUT "No previously edited line to flip\n"; return }
 437     my $temp = $record[$ptr];
 438     $record[$ptr] = $recmeta{prevline};
 439     $recmeta{prevline} = $temp;
 440     print_context();
 441     return 0;
 442 }
 443
 444 sub kill_line {
 445     $recmeta{killline} = $record[$ptr];
 446     my @a = @record[0 .. $ptr - 1];
 447     my @b = @record[$ptr + 1 .. $#record];
 448     @record = (@a, @b);
 449     print_context();
 450     return 0;
 451 }
 452
 453 sub yank_line {
 454     unless ($recmeta{killline})
 455       { print $OUT "No killed line to yank\n"; return }
 456     my @a = @record[0 .. $ptr - 1];
 457     my @b = @record[$ptr .. $#record];
 458     @record = (@a, $conf->{killline}, @b);
 459     print_context();
 460     return 0;
 461 }
 462
 463 sub insert_original {
 464     $record[$ptr] = $recmeta{origline};
 465     print_context();
 466     return 0;
 467 }
 468
 469 sub display_lines {
 470     print $OUT "\nOrig. edit line  :", $recmeta{origline};
 471     print $OUT "Current flip line:", $recmeta{prevline} if $recmeta{prevline};
 472     print $OUT "Last killed line :", $recmeta{killline} if $recmeta{killline};
 473     print $OUT "\n";
 474     return 0;
 475 }
 476
 477 sub dump_record {
 478     my (@explanation) = @_;
 479     print $OUT @explanation;
 480     $recmeta{explanation} = join(' ', 'Tag', $recmeta{tag}, @explanation);
 481     write_record($EXMARC);
 482     return 1;
 483 }
 484
 485 sub next_line {
 486     $ptr++ unless ($ptr == $#record);;
 487     print_context();
 488     return 0;
 489 }
 490
 491 sub prev_line {
 492     $ptr-- unless ($ptr == 0);
 493     print_context();
 494     return 0;
 495 }
 496
 497 sub commit_edit { return 1 }
 498
 499 sub widen_window {
 500     if ($conf->{window} == 15)
 501       { print $OUT "Window can't be bigger than 15 lines\n"; return }
 502     $conf->{window} += 2;
 503     print_context;
 504 }
 505
 506 sub narrow_window {
 507     if ($conf->{window} == 5)
 508       { print $OUT "Window can't be smaller than 5 lines\n"; return }
 509     $conf->{window} -= 2;
 510     print_context;
 511 }
 512
 513 sub help {
 514 print $OUT <<HELP;
 515 Type a replacement for the indicated line, or enter a command.
 516
 517 DISPLAY COMMANDS             | LINE AUTO-EDIT COMMANDS
 518 <  Expand context window     | k  Kill current line
 519 >  Contract context window   | y  Yank last killed line
 520 p  Move pointer to prev line | m  Merge current line into preceding line
 521 n  Move pointer to next line | o  Insert original line
 522 c  Print line context        | f  Flip current line and last edited line
 523 d  Print current saved lines |
 524 -----------------------------+-------------------------------------------
 525 s  Subtitute; replace ARG1 in current line with ARG2. If either ARG
 526    contains spaces, it must be single-quoted
 527 t  Commit changes and resume automated operations
 528 x  Dump record to exception file
 529 q  Quit
 530
 531 HELP
 532 return 0;
 533 }
 534
 535 sub quit { exit }
 536
 537 #-----------------------------------------------------------------------------------
 538 # populate_trash
 539 #-----------------------------------------------------------------------------------
 540 # defined a domain-specific language for specifying MARC tags to be dropped from
 541 # records during processing. it is line oriented, and is specified as follows:
 542 #
 543 # each line may specify any number of tags to be included, either singly (\d{1,3})
 544 # or as a range (\d{1,3}\.\.\d{1,3}
 545 #
 546 # if a single number is given, it must be between '000' and '999', inclusive.
 547 #
 548 # ranges obey the previous rule, and also the first number of the range must be less
 549 # than the second number
 550 #
 551 # finally, any single range in a line may be followed by the keyword 'except'. every
 552 # number or range after 'except' is excluded from the range specified. all these
 553 # numbers must actually be within the range.
 554 #
 555 # specifying a tag twice is an error, to help prevent typos
 556
 557 sub populate_trash {
 558     print $OUT ">>> TRASHTAGS FILE FOUND. LOADING TAGS TO BE STRIPPED FROM OUTPUT\n";
 559     open TRASH, '<', $conf->{trashfile}
 560       or die "Can't open trash tags file!\n";
 561     while (<TRASH>) {
 562         my $lastwasrange = 0;
 563         my %lastrange = ( high => 0, low => 0);
 564         my $except = 0;
 565
 566         my @chunks = split /\s+/;
 567         while (my $chunk = shift @chunks) {
 568
 569             # single values
 570             if ($chunk =~ /^\d{1,3}$/) {
 571                 trash_add($chunk, $except);
 572                 $lastwasrange = 0;
 573                 next;
 574             }
 575
 576             # ranges
 577             if ($chunk =~ /^\d{1,3}\.\.\d{1,3}$/) {
 578                 my ($low, $high) = trash_add_range($chunk, $except, \%lastrange);
 579                 $lastwasrange = 1;
 580                 %lastrange = (low => $low, high => $high)
 581                   unless $except;
 582                 next;
 583             }
 584
 585             # 'except'
 586             if ($chunk eq 'except') {
 587                 die "Keyword 'except' can only follow a range (line $.)\n"
 588                   unless $lastwasrange;
 589                 die "Keyword 'except' may only occur once per line (line $.)\n"
 590                   if $except;
 591                 $except = 1;
 592                 next;
 593             }
 594
 595             die "Unknown chunk $chunk in .trashtags file (line $.)\n";
 596         }
 597     }
 598
 599     # remove original id sequence tag from trash hash if we know it
 600     trash_add($conf->{'original-tag'}, 1)
 601       if ($conf->{'original-tag'} and $conf->{trash}{ $conf->{'original-tag'} });
 602 }
 603
 604 sub trash_add_range {
 605     my ($chunk, $except, $range) = @_;
 606     my ($low,$high) = split /\.\./, $chunk;
 607     die "Ranges must be 'low..high' ($low is greater than $high on line $.)\n"
 608       if ($low > $high);
 609     if ($except) {
 610         die "Exception ranges must be within last addition range (line $.)\n"
 611           if ($low < $range->{low} or $high > $range->{high});
 612     }
 613     for my $tag ($low..$high) {
 614         trash_add($tag, $except)
 615     }
 616     return $low, $high;
 617 }
 618
 619 sub trash_add {
 620     my ($tag, $except) = @_;
 621     my $trash = $conf->{trash};
 622
 623     die "Trash values must be valid tags (000-999)\n"
 624       unless ($tag >= 0 and $tag <= 999);
 625
 626     if ($except) {
 627         delete $trash->{$tag};
 628     } else {
 629         die "Trash tag '$tag' specified twice (line $.)\n"
 630           if $trash->{$tag};
 631         $trash->{$tag} = 1;
 632     }
 633 }
 634
 635 #-----------------------------------------------------------------------
 636
 637 =head2 initialize
 638
 639 Performs boring script initialization. Handles argument parsing,
 640 mostly.
 641
 642 =cut
 643
 644 sub initialize {
 645     my ($c) = @_;
 646     my @missing = ();
 647
 648     # set mode on existing filehandles
 649     binmode(STDIN, ':utf8');
 650
 651     my $rc = GetOptions( $c,
 652                          'autoscrub|a',
 653                          'exception|x=s',
 654                          'output|o=s',
 655                          'prefix|p=s',
 656                          'nocollapse|n',
 657                          'renumber-from|rf=i',
 658                          'renumber-tag|rt=i',
 659                          'renumber-subfield|rs=s',
 660                          'original-tag|ot=i',
 661                          'original-subfield|os=s',
 662                          'script',
 663                          'strip-nines',
 664                          'trashfile|t=s',
 665                          'trashhelp',
 666                          'help|h',
 667                        );
 668     show_help() unless $rc;
 669     show_help() if ($c->{help});
 670     show_trashhelp() if ($c->{trashhelp});
 671
 672     # defaults
 673     my $pfx = $c->{prefix} // "bibs";
 674     $c->{output} = join('.',$c->{prefix},'clean','marc','xml');
 675     $c->{exception} = join('.',$c->{prefix},'exception','marc','xml');
 676     $c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'};
 677     $c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'};
 678     $c->{window} = 5;
 679
 680     # autotrash 901, 903 if strip-nines
 681     if ($c->{'strip-nines'}) {
 682         $c->{trash}{901} = 1;
 683         $c->{trash}{903} = 1;
 684     }
 685
 686     my @keys = keys %{$c};
 687     show_help() unless (@ARGV and @keys);
 688 }
 689
 690 sub show_help {
 691     print <<HELP;
 692 Usage is: marc-cleanup [OPTIONS] <filelist>
 693 Options
 694   --output     -o  Cleaned MARCXML output filename
 695   --exception  -x  Exception (dumped records) MARCXML filename
 696        or
 697   --prefix=<PREFIX>>   -p  Shared prefix for output/exception files. Will produce
 698                            PREFIX.clean.marc.xml and PREFIX.exception.marc.xml
 699
 700   --renumber-from     -rf  Begin renumbering id sequence with this number
 701   --renumber-tag      -rt  Tag to use in renumbering (default: 903)
 702   --renumber-subfield -rs  Subfield code to use in renumbering (default: a)
 703   --original-tag      -ot  Original id tag; will be kept in output even if
 704                            it appears in the trash file
 705   --original-subfield -os  Original id subfield code. If this is specified
 706                            and renumbering is in effect, an old-to-new mapping
 707                            file (old2new.map) will be generated.
 708
 709   --autoscrub  -a  Automatically remove non-numeric tags in data
 710   --nocollapse -n  Don't compress records to one line on output
 711   --strip-nines    Automatically remove any existing 901/903 tags in data
 712   --trashfile  -t  File containing trash tag data (see --trashhelp)
 713
 714
 715   --script         Store human-initiated ops in scriptfile (.mcscript)
 716                    Not yet implemented
 717 HELP
 718 exit;
 719 }
 720
 721 sub show_trashhelp {
 722     print <<HELP;
 723 The marc-cleanup trash tags file is a simple plaintext file. It is a
 724 line oriented format. There are three basic tokens:
 725
 726   * The tag
 727   * The tag range
 728   * The "except" clause
 729
 730 Any number of tags and/or tag ranges can appear on a single line. A
 731 tag cannot appear twice in the file, either alone or as part of a
 732 range. This is to prevent errors in the trash tag listing. Items do
 733 not have to be sorted within a line. These following lines are valid:
 734
 735   850 852 870..879 886 890 896..899
 736   214 696..699 012
 737
 738 Ranges must be ordered internally. That is, "870..879" is valid while
 739 "879..870" is not.
 740
 741 Finally, there can be only one "except" clause on a line. It is
 742 composed of the word "except" followed by one or more tags or
 743 ranges. Except clauses must follow a range, and all tags within the
 744 clause must be within the range which the clause follows.
 745
 746   900..997 except 935 950..959 987 994
 747
 748 is a valid example.
 749 HELP
 750 exit;
 751 }