use Term::ReadLine;
use Equinox::Migration::SimpleTagList;
-binmode STDOUT, ":utf8";
# Term::ReadLine object for this program, registered under the name 'yaz-cleanup'
my $term = new Term::ReadLine 'yaz-cleanup';
# print through the terminal object's output handle when it provides one,
# otherwise fall back to STDOUT
my $OUT = $term->OUT || \*STDOUT;
+binmode STDOUT, ":utf8";
+binmode $OUT, ":utf8";
# enable autoflush on the currently selected output handle
$| = 1;
edit("Non-numerics in tag") unless $conf->{autoscrub};
next;
}
- # test for existing 901/903 unless we're autocleaning them
- unless ($conf->{'strip9'} or $conf->{'no-strip9'}) {
- if ($match == 901 or $match == 903) {
- edit("Incoming 901/903 found in data");
- next;
- }
- }
}
# subfields can't be non-alphanumeric
write_record($NUMARC);
}
# write the closing </collection> tag to the $NUMARC output handle
print $NUMARC "</collection>\n";
# final status line on $OUT; the added form also reports the ricount / rocount
# counters kept in $conf
-print $OUT "\nDone. \n";
+print $OUT "\nDone. ",$conf->{ricount}," in / ",$conf->{rocount}," out \n";
#-----------------------------------------------------------------------------------
sub do_automated_cleanups {
$ptr = 0;
until ($ptr == $#record) {
+
# catch empty datafield elements
if ($record[$ptr] =~ m/<datafield tag="..."/) {
if ($record[$ptr + 1] =~ m|</datafield>|) {
message("Dollar sign corrected");
}
- # clean up tags with spaces in them
- $record[$ptr] =~ s/tag=" /tag="00/g;
- $record[$ptr] =~ s/tag=" /tag="0/g;
- $record[$ptr] =~ s/tag="-/tag="0/g;
- $record[$ptr] =~ s/tag="(\d\d) /tag="0$1/g;
-
# automatable subfield maladies
$record[$ptr] =~ s/code=" ">c/code="c">/;
$record[$ptr] =~ s/code=" ">\$/code="c">\$/;
my $osub = $conf->{'original-subfield'};
$recmeta{oid} = 'NONE';
- until ($line =~ m|</record>|) {
+ # skim to end of this tag
+ until ($line =~ m|</datafield>|) {
if ($line =~ /<subfield code="$osub">(.+?)</)
{ $recmeta{oid} = $1 }
$lptr++;
# buildrecord: read lines from the MARC filehandle and collect the next
# <record> ... </record> span into @record, one input line per element.
# Resets @record and %recmeta and increments $conf->{ricount} for each record
# started.
# Returns the undefined $l when input runs out before another <record> line
# is seen, and 1 once a record has been collected.
# In this span, lines prefixed "-" are the removed form of the read loop and
# lines prefixed "+" are the form that replaces it.
sub buildrecord {
my $l = '';
+ my $istrash = 0;
+ my $trash = $conf->{trash};
+
# read forward until a line containing <record> turns up or input ends
$l = <MARC> while (defined $l and $l !~ /<record>/);
# input ended first: hand the undef back to the caller
return $l unless defined $l;
# start a fresh record: clear per-record state, count one more record read
@record = ();
%recmeta = ();
$conf->{ricount}++;
# NOTE(review): neither form of the loop below checks $l for undef; if input
# ended before a </record> line the condition could never match -- confirm
# inputs are always complete records
- until ($l =~ m|</record>|)
- { push @record, $l; $l = <MARC>; }
+ until ($l =~ m|</record>|) {
+ # clean up tags with spaces in them
+ $l =~ s/tag=" /tag="00/g;
+ $l =~ s/tag=" /tag="0/g;
+ $l =~ s/tag="-/tag="0/g;
+ $l =~ s/tag="(\d\d) /tag="0$1/g;
+
+ # excise unwanted tags
# while $istrash is set, the current line is dropped instead of stored; the
# flag is cleared on the line that contains the closing </datafield
+ if ($istrash) {
+ $istrash = 0 if ($l =~ m|</datafield|);
+ $l = <MARC>;
+ next;
+ }
# a datafield is dropped when $trash->has() reports its 3-character tag, or
# when autoscrub is on and the tag contains a non-digit; 'next' re-enters the
# loop with the same $l, so the $istrash branch above discards this line too
+ if ($l =~ m/<datafield tag="(.{3})"/) {
+ if ($trash->has($1) or ($conf->{autoscrub} and $1 =~ /\D/))
+ { $istrash = 1; next }
+ }
+
+ push @record, $l;
+ $l = <MARC>;
+ }
# $l is now the line that matched </record>; store it as the last element
push @record, $l;
return 1;
}
sub write_record {
my ($FH) = @_;
- my $trash = $conf->{trash};
if ($FH eq 'EX') {
$EXMARC = undef;
print $FH '<!-- ', $recmeta{explanation}, " -->\n"
if(defined $recmeta{explanation});
- # excise unwanted tags
- if (defined $trash or $conf->{autoscrub}) {
- my @trimmed = ();
- my $istrash = 0;
- for my $line (@record) {
- if ($istrash) {
- $istrash = 0 if $line =~ m|</datafield|;
- next;
- }
- if ($line =~ m/<datafield tag="(.{3})"/) {
- my $tag = $1;
- if ($trash->has($tag) or ($conf->{autoscrub} and $tag =~ /\D/)) {
- $istrash = 1;
- next
- }
- }
- push @trimmed, $line;
- }
- @record = @trimmed;
- }
-
# add 903(?) with new record id
my $renumber = '';
if ($conf->{'renumber-from'}) {
'original-tag|ot=i',
'original-subfield|os=s',
'script',
- 'strip9',
'no-strip9',
'trashfile|t=s',
'trashhelp',
'help|h',
);
- show_help() unless $rc;
+ show_help() unless $rc and @ARGV;
show_help() if ($c->{help});
show_trashhelp() if ($c->{trashhelp});
$c->{window} = 5;
if ($c->{trashfile}) {
- $c->{trash} = Equinox::Migration::SimpleTagList->new($conf->{trashfile})
+ $c->{trash} = Equinox::Migration::SimpleTagList->new(file => $conf->{trashfile})
} else {
$c->{trash} = Equinox::Migration::SimpleTagList->new;
}
- # remove original id sequence tag from trash hash if we know it
- $c->{trash}->remove_tag($c->{'original-tag'})
- if ( $c->{'original-tag'} and $c->{trash}->has($c->{'original-tag'}) );
-
- # autotrash 901, 903 if strip-nines
- if ($c->{'strip9'}) {
+ # autotrash 901, 903 unless no strip-nines
+ unless ($c->{'no-strip9'}) {
$c->{trash}->add_tag(901);
$c->{trash}->add_tag(903);
}
+ # remove original id sequence tag from trash hash if we know it
+ $c->{trash}->remove_tag($c->{'original-tag'})
+ if ( $c->{'original-tag'} and $c->{trash}->has($c->{'original-tag'}) );
my @keys = keys %{$c};
show_help() unless (@ARGV and @keys);
--autoscrub -a Automatically remove non-numeric tags in data
--nocollapse -n Don't compress records to one line on output
- --strip9 Automatically remove any existing 901/903 tags in data
- --no-strip9 Don't complain about 901/903 tags in data
+ --no-strip9 Don't autoremove 901/903 tags in data
--trashfile -t File containing trash tag data (see --trashhelp)