#!/usr/bin/perl
+require 5.10.0;
use strict;
use warnings;
use Getopt::Long;
use Term::ReadLine;
+binmode STDOUT, ":utf8";
my $term = new Term::ReadLine 'yaz-cleanup';
my $OUT = $term->OUT || \*STDOUT;
populate_trash() if ($conf->{trashfile});
# set up files, since everything appears to be in order
-open MARC, '<:utf8', (shift || 'incoming.marc.xml')
+my $marcfile = shift || 'incoming.marc.xml';
+open MARC, '<:utf8', $marcfile
or die "Can't open input file $!\n";
open my $NUMARC, '>:utf8', $conf->{output}
or die "Can't open output file $!\n";
open my $OLD2NEW, '>', 'old2new.map'
if ($conf->{'renumber-from'} and $conf->{'original-subfield'});
my $EXMARC = 'EX';
+print $NUMARC "<collection>\n";
+# Count the lines that contain a record start tag, for the progress display.
+# Done in Perl rather than via `grep -c` in backticks: interpolating $marcfile
+# into a shell command breaks on filenames with spaces or shell metacharacters.
+$conf->{totalrecs} = 0;
+{
+    open my $count_fh, '<', $marcfile
+        or die "Can't open input file $!\n";
+    while (my $line = <$count_fh>) {
+        $conf->{totalrecs}++ if $line =~ /<record/;
+    }
+    close $count_fh;
+}
+$conf->{percent} = 0;
my @record = (); # current record storage
my %recmeta = (); # metadata about current record
help => \&help,
);
-my @spinner = qw(- / | \\);
+my @spinner = qw(- \\ | /);
my $sidx = 0;
-while ( buildrecord() ) {
+# labelled so that checks nested in the inner loop can skip to the next record
+RECORD: while ( buildrecord() ) {
- unless ($conf->{ricount} % 100) {
- print "\rWorking... ", $spinner[$sidx];
+ unless ($conf->{ricount} % 50) {
+ # guard the division: totalrecs may be 0 or empty if no records were counted
+ $conf->{percent} = $conf->{totalrecs}
+     ? int(($conf->{ricount} / $conf->{totalrecs}) * 100) : 0;
+ print "\rWorking (",$conf->{percent},"%) ", $spinner[$sidx];
$sidx = ($sidx == $#spinner) ? 0 : $sidx + 1;
}
- do_automated_cleanups();
+ my $rc = do_automated_cleanups();
+ next if $rc;
$ptr = 0;
until ($ptr == $#record) {
+ # get datafield/tag data if we have it
+ my $rc = stow_record_data();
+ # this is top-level code, not a sub, so `return` would die at runtime;
+ # a true $rc means the record was dumped, so move on to the next record
+ next RECORD if $rc;
+
# naked ampersands
if ($record[$ptr] =~ /&/ && $record[$ptr] !~ /&\w+?;/)
{ edit("Naked ampersand"); $ptr= 0; next }
next;
}
# test for existing 901/903 unless we're autocleaning them
- unless ($conf->{'strip-nines'}) {
+ unless ($conf->{'strip9'} or $conf->{'no-strip9'}) {
if ($match == 901 or $match == 903) {
edit("Incoming 901/903 found in data");
next;
# subfields can't be non-alphanumeric
if ($record[$ptr] =~ /<subfield code="(.*?)"/) {
- my $match = $1;
- if ($match =~ /\P{IsAlnum}/ or $match eq '') {
+ if ($1 =~ /\P{IsAlnum}/ or $1 eq '') {
edit("Junk in subfield code/Null subfield code");
next;
}
}
+ # subfield codes must be exactly one character long
+ if ($record[$ptr] =~ /<subfield code="(\w{2,})"/) {
+ edit("Subfield code larger than 1 char");
+ next;
+ }
$ptr++;
}
write_record($NUMARC);
}
-#print $NUMARC "</collection>\n";
+print $NUMARC "</collection>\n";
print $OUT "\nDone. \n";
sub do_automated_cleanups {
$ptr = 0;
until ($ptr == $#record) {
- # get datafield/tag data if we have it
- stow_record_data();
-
# catch empty datafield elements
if ($record[$ptr] =~ m/<datafield tag="..."/) {
if ($record[$ptr + 1] =~ m|</datafield>|) {
# automatable subfield maladies
$record[$ptr] =~ s/code=" ">c/code="c">/;
- $record[$ptr] =~ s/code=" ">\$/code="c"$>/;
+ $record[$ptr] =~ s/code=" ">\$/code="c">\$/;
}
+ return 0;
}
sub stow_record_data {
# get tag data if we're looking at it
-
if ($record[$ptr] =~ m/<datafield tag="(?<TAG>.{3})"/) {
$recmeta{tag} = $+{TAG};
$record[$ptr] =~ m/ind1="(?<IND1>.)"/;
$recmeta{ind1} = $+{IND1} || '';
$record[$ptr] =~ m/ind2="(?<IND2>.)"/;
$recmeta{ind2} = $+{IND2} || '';
-
+
unless (defined $recmeta{tag}) {
message("Autokill record: no detectable tag");
dump_record("No detectable tag") ;
+ return 1;
}
# and since we are looking at a tag, see if it's the original id
- if ($conf->{'original-subfield'} and
- $recmeta{tag} == $conf->{'original-tag'}) {
+ if ($conf->{'original-subfield'} and $recmeta{tag} == $conf->{'original-tag'}) {
my $line = $record[$ptr]; my $lptr = $ptr;
my $osub = $conf->{'original-subfield'};
$recmeta{oid} = 'NONE';
until ($line =~ m|</record>|) {
+ if ($line =~ /<subfield code="$osub">(.+?)</)
+ { $recmeta{oid} = $1 }
$lptr++;
$line = $record[$lptr];
- $recmeta{oid} = $+{TAG}
- if ($line =~ /<subfield code="$osub">(.+?)</);
+ }
+ # oid is preset to the 'NONE' sentinel before the scan, so a definedness
+ # test can never fire; check for the sentinel instead
+ if ($recmeta{oid} eq 'NONE') {
+ message("Autokill record: no oldid when old2new mapping requested");
+ dump_record("No old id found");
+ return 1;
}
}
}
+ return 0;
}
#-----------------------------------------------------------------------------------
# write to old->new map file if needed
if ($conf->{'renumber-from'} and $conf->{'original-subfield'}) {
- unless (defined $recmeta{oid}) {
- my $msg = join(' ', "No old id num found");
- dump_record($msg);
- } else {
- print $OLD2NEW $recmeta{oid}, "\t", $recmeta{nid}, "\n"
- }
+ print $OLD2NEW $recmeta{oid}, "\t", $recmeta{nid}, "\n"
}
# actually write the record
'original-tag|ot=i',
'original-subfield|os=s',
'script',
- 'strip-nines',
+ 'strip9',
+ 'no-strip9',
'trashfile|t=s',
'trashhelp',
'help|h',
show_trashhelp() if ($c->{trashhelp});
# defaults
- if ($c->{prefix}) {
- $c->{output} = join('.',$c->{prefix},'marc','xml');
- $c->{exception} = join('.',$c->{prefix},'marc','ex');
- }
+ # Build the output/exception filenames from the prefix, falling back to
+ # "bibs" when --prefix was not given (the joins must use $pfx, not the
+ # possibly-undefined $c->{prefix}).
+ # NOTE(review): this unconditionally replaces any --output/--exception
+ # value supplied on the command line -- confirm that is intended.
+ my $pfx = $c->{prefix} // "bibs";
+ $c->{output} = join('.',$pfx,'clean','marc','xml');
+ $c->{exception} = join('.',$pfx,'exception','marc','xml');
$c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'};
$c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'};
$c->{window} = 5;
# autotrash 901, 903 if strip-nines
- if ($c->{'strip-nines'}) {
+ if ($c->{'strip9'}) {
$c->{trash}{901} = 1;
$c->{trash}{903} = 1;
}
--output -o Cleaned MARCXML output filename
--exception -x Exception (dumped records) MARCXML filename
or
- --prefix=<PREFIX>> -p Shared prefix for output/exception files. Will
- produce PREFIX.marc.xml and PREFIX.ex.xml
-
- --trashfile -t File containing trash tag data (see --trashhelp)
+ --prefix=<PREFIX> -p Shared prefix for output/exception files. Will produce
+ PREFIX.clean.marc.xml and PREFIX.exception.marc.xml
--renumber-from -rf Begin renumbering id sequence with this number
--renumber-tag -rt Tag to use in renumbering (default: 903)
and renumbering is in effect, an old-to-new mapping
file (old2new.map) will be generated.
- --nocollapse -n Don't compress records to one line on output
--autoscrub -a Automatically remove non-numeric tags in data
- --strip-nines Automatically remove any existing 901/903 tags in data
+ --nocollapse -n Don't compress records to one line on output
+ --strip9 Automatically remove any existing 901/903 tags in data (reversible)
+ --trashfile -t File containing trash tag data (see --trashhelp)
+
--script Store human-initiated ops in scriptfile (.mcscript)
Not yet implemented