use Getopt::Long;
use Term::ReadLine;
+use Equinox::Migration::SimpleTagList;
-binmode STDOUT, ":utf8";
my $term = new Term::ReadLine 'yaz-cleanup';
my $OUT = $term->OUT || \*STDOUT;
+binmode STDOUT, ":utf8";
+binmode $OUT, ":utf8";
$| = 1;
# initialization and setup
my $conf = {};
initialize($conf);
-populate_trash() if ($conf->{trashfile});
# set up files, since everything appears to be in order
my $marcfile = shift || 'incoming.marc.xml';
open my $OLD2NEW, '>', 'old2new.map'
if ($conf->{'renumber-from'} and $conf->{'original-subfield'});
my $EXMARC = 'EX';
-print $NUMARC "<collection>\n";
+print $NUMARC "<collection xmlns=\"http://www.loc.gov/MARC21/slim\">\n";
$conf->{totalrecs} = `grep -c '<record' $marcfile`;
chomp $conf->{totalrecs};
$conf->{percent} = 0;
-my @record = (); # current record storage
-my %recmeta = (); # metadata about current record
-my $ptr = 0; # record index pointer
+my @record; # current record storage
+my %recmeta; # metadata about current record
+my $ptr = 0; # record index pointer
# this is the dispatch table which drives command selection in
# edit(), below
edit("Non-numerics in tag") unless $conf->{autoscrub};
next;
}
- # test for existing 901/903 unless we're autocleaning them
- unless ($conf->{'strip9'} or $conf->{'no-strip9'}) {
- if ($match == 901 or $match == 903) {
- edit("Incoming 901/903 found in data");
- next;
- }
- }
}
# subfields can't be non-alphanumeric
write_record($NUMARC);
}
print $NUMARC "</collection>\n";
-print $OUT "\nDone. \n";
+print $OUT "\nDone. ",$conf->{ricount}," in / ",$conf->{rocount}," out \n";
#-----------------------------------------------------------------------------------
sub do_automated_cleanups {
$ptr = 0;
until ($ptr == $#record) {
+
# catch empty datafield elements
if ($record[$ptr] =~ m/<datafield tag="..."/) {
if ($record[$ptr + 1] =~ m|</datafield>|) {
my @a = @record[0 .. $ptr - 1];
my @b = @record[$ptr + 2 .. $#record];
@record = (@a, @b);
+ @a = undef; @b = undef;
message("Empty datafield scrubbed");
$ptr = 0;
next;
my @a = @record[0 .. $ptr - 1];
my @b = @record[$ptr + 1 .. $#record];
@record = (@a, @b);
+ @a = undef; @b = undef;
message("Empty subfield scrubbed");
$ptr = 0;
next;
message("Dollar sign corrected");
}
- # clean up tags with spaces in them
- $record[$ptr] =~ s/tag=" /tag="00/g;
- $record[$ptr] =~ s/tag=" /tag="0/g;
- $record[$ptr] =~ s/tag="-/tag="0/g;
- $record[$ptr] =~ s/tag="(\d\d) /tag="0$1/g;
-
# automatable subfield maladies
$record[$ptr] =~ s/code=" ">c/code="c">/;
$record[$ptr] =~ s/code=" ">\$/code="c">\$/;
my $osub = $conf->{'original-subfield'};
$recmeta{oid} = 'NONE';
- until ($line =~ m|</record>|) {
+ # skim to end of this tag
+ until ($line =~ m|</datafield>|) {
if ($line =~ /<subfield code="$osub">(.+?)</)
{ $recmeta{oid} = $1 }
$lptr++;
sub edit {
my ($msg) = @_;
- return if $conf->{trash}{ $recmeta{tag} };
+ return if $conf->{trash}->has( $recmeta{tag} );
$conf->{editmsg} = $msg;
print_fullcontext();
sub buildrecord {
my $l = '';
+ my $istrash = 0;
+ my $trash = $conf->{trash};
+
$l = <MARC> while (defined $l and $l !~ /<record>/);
return $l unless defined $l;
- @record = ();
- %recmeta = ();
$conf->{ricount}++;
- until ($l =~ m|</record>|)
- { push @record, $l; $l = <MARC>; }
- push @record, $l;
+ for (keys %recmeta) { $recmeta{$_} = undef }
+ for (0 .. @record) { delete $record[$_] }
+
+ my $i = 0;
+ until ($l =~ m|</record>|) {
+ # clean up tags with spaces in them
+ $l =~ s/tag=" /tag="00/g;
+ $l =~ s/tag=" /tag="0/g;
+ $l =~ s/tag="-/tag="0/g;
+ $l =~ s/tag="(\d\d) /tag="0$1/g;
+
+ # excise unwanted tags
+ if ($istrash) {
+ $istrash = 0 if ($l =~ m|</datafield|);
+ $l = <MARC>;
+ next;
+ }
+ if ($l =~ m/<datafield tag="(.{3})"/) {
+ if ($trash->has($1) or ($conf->{autoscrub} and $1 =~ /\D/))
+ { $istrash = 1; next }
+ }
+
+ $record[$i] = $l;
+ $l = <MARC>;
+ $i++;
+ }
+ $record[$i] = $l;
return 1;
}
sub write_record {
my ($FH) = @_;
- my $trash = $conf->{trash};
if ($FH eq 'EX') {
$EXMARC = undef;
print $FH '<!-- ', $recmeta{explanation}, " -->\n"
if(defined $recmeta{explanation});
- # excise unwanted tags
- if (keys %{$trash} or $conf->{autoscrub}) {
- my @trimmed = ();
- my $istrash = 0;
- for my $line (@record) {
- if ($istrash) {
- $istrash = 0 if $line =~ m|</datafield|;
- next;
- }
- if ($line =~ m/<datafield tag="(.{3})"/) {
- my $tag = $1;
- if ($trash->{$tag} or ($conf->{autoscrub} and $tag =~ /\D/)) {
- $istrash = 1;
- next
- }
- }
- push @trimmed, $line;
- }
- @record = @trimmed;
- }
-
# add 903(?) with new record id
my $renumber = '';
if ($conf->{'renumber-from'}) {
'" ind1=" " ind2=" "> <subfield code="',
$conf->{'renumber-subfield'},
'">', $recmeta{nid}, "</subfield></datafield>\n");
- my @tmp = @record[0 .. $#record - 1];
- my $last = $record[$#record];
+ my @tmp = @record[0 .. @record - 2];
+ my $last = $record[-1];
+ @record = undef;
@record = (@tmp, $renumber, $last);
+ @tmp = undef; $last = undef;
$conf->{'renumber-from'}++;
}
$recmeta{prevline} = $record[$ptr];
$record[$ptr] =~ s/$from/$to/;
+ $ofrom = undef; $to = undef; $from = undef;
print_context();
return 0;
}
my $temp = $record[$ptr];
$record[$ptr] = $recmeta{prevline};
$recmeta{prevline} = $temp;
+ $temp = undef;
print_context();
return 0;
}
my @a = @record[0 .. $ptr - 1];
my @b = @record[$ptr + 1 .. $#record];
@record = (@a, @b);
+ @a = undef; @b = undef;
print_context();
return 0;
}
my @a = @record[0 .. $ptr - 1];
my @b = @record[$ptr .. $#record];
@record = (@a, $conf->{killline}, @b);
+ @a = undef; @b = undef;
print_context();
return 0;
}
my (@explanation) = @_;
print $OUT @explanation;
$recmeta{explanation} = join(' ', 'Tag', $recmeta{tag}, @explanation);
+ @explanation = undef;
write_record($EXMARC);
return 1;
}
sub quit { exit }
-#-----------------------------------------------------------------------------------
-# populate_trash
-#-----------------------------------------------------------------------------------
-# defined a domain-specific language for specifying MARC tags to be dropped from
-# records during processing. it is line oriented, and is specified as follows:
-#
-# each line may specify any number of tags to be included, either singly (\d{1,3})
-# or as a range (\d{1,3}\.\.\d{1,3}
-#
-# if a single number is given, it must be between '000' and '999', inclusive.
-#
-# ranges obey the previous rule, and also the first number of the range must be less
-# than the second number
-#
-# finally, any single range in a line may be followed by the keyword 'except'. every
-# number or range after 'except' is excluded from the range specified. all these
-# numbers must actually be within the range.
-#
-# specifying a tag twice is an error, to help prevent typos
-
-sub populate_trash {
- print $OUT ">>> TRASHTAGS FILE FOUND. LOADING TAGS TO BE STRIPPED FROM OUTPUT\n";
- open TRASH, '<', $conf->{trashfile}
- or die "Can't open trash tags file!\n";
- while (<TRASH>) {
- my $lastwasrange = 0;
- my %lastrange = ( high => 0, low => 0);
- my $except = 0;
-
- my @chunks = split /\s+/;
- while (my $chunk = shift @chunks) {
-
- # single values
- if ($chunk =~ /^\d{1,3}$/) {
- trash_add($chunk, $except);
- $lastwasrange = 0;
- next;
- }
-
- # ranges
- if ($chunk =~ /^\d{1,3}\.\.\d{1,3}$/) {
- my ($low, $high) = trash_add_range($chunk, $except, \%lastrange);
- $lastwasrange = 1;
- %lastrange = (low => $low, high => $high)
- unless $except;
- next;
- }
-
- # 'except'
- if ($chunk eq 'except') {
- die "Keyword 'except' can only follow a range (line $.)\n"
- unless $lastwasrange;
- die "Keyword 'except' may only occur once per line (line $.)\n"
- if $except;
- $except = 1;
- next;
- }
-
- die "Unknown chunk $chunk in .trashtags file (line $.)\n";
- }
- }
-
- # remove original id sequence tag from trash hash if we know it
- trash_add($conf->{'original-tag'}, 1)
- if ($conf->{'original-tag'} and $conf->{trash}{ $conf->{'original-tag'} });
-}
-
-sub trash_add_range {
- my ($chunk, $except, $range) = @_;
- my ($low,$high) = split /\.\./, $chunk;
- die "Ranges must be 'low..high' ($low is greater than $high on line $.)\n"
- if ($low > $high);
- if ($except) {
- die "Exception ranges must be within last addition range (line $.)\n"
- if ($low < $range->{low} or $high > $range->{high});
- }
- for my $tag ($low..$high) {
- trash_add($tag, $except)
- }
- return $low, $high;
-}
-
-sub trash_add {
- my ($tag, $except) = @_;
- my $trash = $conf->{trash};
-
- die "Trash values must be valid tags (000-999)\n"
- unless ($tag >= 0 and $tag <= 999);
-
- if ($except) {
- delete $trash->{$tag};
- } else {
- die "Trash tag '$tag' specified twice (line $.)\n"
- if $trash->{$tag};
- $trash->{$tag} = 1;
- }
-}
-
#-----------------------------------------------------------------------
=head2 initialize
'original-tag|ot=i',
'original-subfield|os=s',
'script',
- 'strip9',
'no-strip9',
'trashfile|t=s',
'trashhelp',
'help|h',
);
- show_help() unless $rc;
+ show_help() unless $rc and @ARGV;
show_help() if ($c->{help});
show_trashhelp() if ($c->{trashhelp});
$c->{exception} = join('.',$c->{prefix},'exception','marc','xml');
$c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'};
$c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'};
- $c->{window} = 5;
+ $c->{window} = 9;
- # autotrash 901, 903 if strip-nines
- if ($c->{'strip9'}) {
- $c->{trash}{901} = 1;
- $c->{trash}{903} = 1;
+ if ($c->{trashfile}) {
+ $c->{trash} = Equinox::Migration::SimpleTagList->new(file => $conf->{trashfile})
+ } else {
+ $c->{trash} = Equinox::Migration::SimpleTagList->new;
}
+ # autotrash 901, 903 unless no strip-nines
+ unless ($c->{'no-strip9'}) {
+ $c->{trash}->add_tag(901);
+ $c->{trash}->add_tag(903);
+ }
+ # remove original id sequence tag from trash hash if we know it
+ $c->{trash}->remove_tag($c->{'original-tag'})
+ if ( $c->{'original-tag'} and $c->{trash}->has($c->{'original-tag'}) );
my @keys = keys %{$c};
show_help() unless (@ARGV and @keys);
--autoscrub -a Automatically remove non-numeric tags in data
--nocollapse -n Don't compress records to one line on output
- --strip9 Automatically remove any existing 901/903 tags in data (reversible)
+ --no-strip9 Don't autoremove 901/903 tags in data
--trashfile -t File containing trash tag data (see --trashhelp)
sub show_trashhelp {
print <<HELP;
-The marc-cleanup trash tags file is a simple plaintext file. It is a
-line oriented format. There are three basic tokens:
-
- * The tag
- * The tag range
- * The "except" clause
-
-Any number of tags and/or tag ranges can appear on a single line. A
-tag cannot appear twice in the file, either alone or as part of a
-range. This is to prevent errors in the trash tag listing. Items do
-not have to be sorted within a line. These following lines are valid:
-
- 850 852 870..879 886 890 896..899
- 214 696..699 012
-
-Ranges must be ordered internally. That is, "870..879" is valid while
-"879..870" is not.
-
-Finally, there can be only one "except" clause on a line. It is
-composed of the word "except" followed by one or more tags or
-ranges. Except clauses must follow a range, and all tags within the
-clause must be within the range which the clause follows.
+See
- 900..997 except 935 950..959 987 994
+http://intra.lan.hq.esilibrary.com/dokuwiki/doku.php?id=migration:tag_files
-is a valid example.
+for tag file syntax information.
HELP
exit;
}