# Top-level driver: read a MARCXML file record by record, check each line of
# each record, and write the surviving records to the cleaned output file.
# Turn on autoflush for the currently selected output handle so the "\r"
# progress line below shows up immediately.
$| = 1;
# initialization and setup
-my $conf = {};
-initialize($conf);
+my $c = {};
+initialize($c);
# set up files, since everything appears to be in order
-my $marcfile = shift || 'incoming.marc.xml';
-open MARC, '<:utf8', $marcfile
+open MARC, '<:utf8', $c->{marcfile}
or die "Can't open input file $!\n";
-open my $NUMARC, '>:utf8', $conf->{output}
+open my $NUMARC, '>:utf8', $c->{output}
or die "Can't open output file $!\n";
# The old-id -> new-id map is only opened when renumbering is requested and
# the tag holding the original id is known.
# NOTE(review): unlike the two opens above, this open is not checked.
open my $OLD2NEW, '>', 'old2new.map'
- if ($conf->{'renumber-from'} and $conf->{'original-tag'});
+ if ($c->{'renumber-from'} and $c->{'original-tag'});
# Placeholder value; the record-writing code further down opens the real
# exception file the first time it is handed 'EX' as its filehandle.
my $EXMARC = 'EX';
print $NUMARC "<collection xmlns=\"http://www.loc.gov/MARC21/slim\">\n";
# Pre-count the records for the progress display by shelling out to grep.
# `grep -c` counts matching lines, so this assumes at most one '<record' per
# input line.
# NOTE(review): the file name is interpolated into the shell command unquoted;
# a path containing spaces or shell metacharacters will misbehave.
-$conf->{totalrecs} = `grep -c '<record' $marcfile`;
-chomp $conf->{totalrecs};
-$conf->{percent} = 0;
+$c->{totalrecs} = `grep -c '<record' $c->{marcfile}`;
+chomp $c->{totalrecs};
+$c->{percent} = 0;
my @record; # current record storage
my %recmeta; # metadata about current record
my $sidx = 0;
# Main loop: one iteration per record that buildrecord() loads into @record.
while ( buildrecord() ) {
# Refresh the percentage and advance the spinner every 50th input record.
- unless ($conf->{ricount} % 50) {
- $conf->{percent} = int(($conf->{ricount} / $conf->{totalrecs}) * 100);
- print "\rWorking (",$conf->{percent},"%) ", $spinner[$sidx];
+ unless ($c->{ricount} % 50) {
+ $c->{percent} = int(($c->{ricount} / $c->{totalrecs}) * 100);
+ print "\rWorking (",$c->{percent},"%) ", $spinner[$sidx];
$sidx = ($sidx == $#spinner) ? 0 : $sidx + 1;
}
# Walk the lines of the current record until $ptr reaches the last index.
# NOTE(review): the code that advances $ptr is not visible in this fragment.
$ptr = 0;
until ($ptr == $#record) {
# get datafield/tag data if we have it
- $rc = stow_record_data() if ($conf->{'renumber-from'} and $conf->{'original-tag'});
+ $rc = stow_record_data() if ($c->{'renumber-from'} and $c->{'original-tag'});
# NOTE(review): this `return` sits in top-level code as shown here; confirm
# against the full file whether `next`/`last` was intended.
return $rc if $rc;
# naked ampersands
# NOTE(review): the pattern match that populates $1 for the line below is not
# visible in this fragment.
my $match = $1;
# tags must be numeric
if ($match =~ /\D/) {
- edit("Non-numerics in tag") unless $conf->{autoscrub};
+ edit("Non-numerics in tag") unless $c->{autoscrub};
next;
}
}
write_record($NUMARC);
}
# Close the MARCXML collection element and report the record counters.
print $NUMARC "</collection>\n";
-print $OUT "\nDone. ",$conf->{ricount}," in; ",$conf->{rocount}," dumped \n";
+print $OUT "\nDone. ",$c->{ricount}," in; ",$c->{rocount}," dumped \n";
#-----------------------------------------------------------------------------------
sub stow_record_data {
# get tag data if we're looking at it
my $tag = 0;
- if ($record[$ptr] =~ m/<(control|data)field tag="(?<TAG>.{3})"/) {
- $recmeta{tag} = $+{TAG};
+ if ($record[$ptr] =~ m/<(control|data)field tag="(.{3})"/) {
+ $recmeta{tag} = $1;
$tag = $recmeta{tag};
- $record[$ptr] =~ m/ind1="(?<IND1>.)"/;
- $recmeta{ind1} = $+{IND1} || '';
- $record[$ptr] =~ m/ind2="(?<IND2>.)"/;
- $recmeta{ind2} = $+{IND2} || '';
+ $record[$ptr] =~ m/ind1="(.)"/;
+ $recmeta{ind1} = $1 || '';
+ $record[$ptr] =~ m/ind2="(.)"/;
+ $recmeta{ind2} = $1 || '';
unless ($tag) {
message("Autokill record: no detectable tag");
}
# and since we are looking at a tag, see if it's the original id
- if ($tag == $conf->{'original-tag'}) {
+ if ($tag == $c->{'original-tag'}) {
my $oid = 0;
if ($tag < 10) {
# controlfield
if ($record[$ptr] =~ m|<controlfield tag="$tag">(.+?)</controlfield>|)
{ $oid = $1; print $OLD2NEW "$oid\t", $recmeta{nid}, "\n" }
- } elsif ($tag >= 10 and $conf->{'original-subfield'}) {
+ } elsif ($tag >= 10 and $c->{'original-subfield'}) {
# datafield
my $line = $record[$ptr]; my $lptr = $ptr;
- my $osub = $conf->{'original-subfield'};
+ my $osub = $c->{'original-subfield'};
# skim to end of this tag
until ($line =~ m|</datafield>|) {
if ($line =~ /<subfield code="$osub">(.+?)</)
}
# got it; write to old->new map file
- if ($conf->{'renumber-from'} and $conf->{'original-subfield'}) {
+ if ($c->{'renumber-from'} and $c->{'original-subfield'}) {
}
}
sub edit {
my ($msg) = @_;
- return if $conf->{trash}->has( $recmeta{tag} );
- if ( $conf->{fullauto} )
+ return if $c->{trash}->has( $recmeta{tag} );
+ if ( $c->{fullauto} )
{ dump_record($msg); return }
- $conf->{editmsg} = $msg;
+ $c->{editmsg} = $msg;
print_fullcontext();
# stow original problem line
sub buildrecord {
my $l = '';
my $istrash = 0;
- my $trash = $conf->{trash};
+ my $trash = $c->{trash};
$l = <MARC> while (defined $l and $l !~ /<record>/);
return $l unless defined $l;
- $conf->{ricount}++;
+ $c->{ricount}++;
for (keys %recmeta) { $recmeta{$_} = undef }
for (0 .. @record) { delete $record[$_] }
next;
}
if ($l =~ m/<datafield tag="(.{3})"/) {
- if ($trash->has($1) or ($conf->{autoscrub} and $1 =~ /\D/))
+ if ($trash->has($1) or ($c->{autoscrub} and $1 =~ /\D/))
{ $istrash = 1; next }
}
}
# add 903(?) with new record id
- if ($conf->{'renumber-from'}) {
- $recmeta{nid} = $conf->{'renumber-from'};
- push @record, join('', ' <datafield tag="', $conf->{'renumber-tag'},
+ if ($c->{'renumber-from'}) {
+ $recmeta{nid} = $c->{'renumber-from'};
+ push @record, join('', ' <datafield tag="', $c->{'renumber-tag'},
'" ind1=" " ind2=" "> <subfield code="',
- $conf->{'renumber-subfield'},
+ $c->{'renumber-subfield'},
'">',
$recmeta{nid},
"</subfield></datafield>\n");
- $conf->{'renumber-from'}++;
+ $c->{'renumber-from'}++;
}
$i++;
if ($FH eq 'EX') {
$EXMARC = undef;
- open $EXMARC, '>:utf8', $conf->{exception}
+ open $EXMARC, '>:utf8', $c->{exception}
or die "Can't open exception file $!\n";
$FH = $EXMARC;
}
if(defined $recmeta{explanation});
# scrub newlines (unless told not to or writing exception record)
- unless ($conf->{nocollapse} or $FH eq $EXMARC)
+ unless ($c->{nocollapse} or $FH eq $EXMARC)
{ s/\n// for (@record) }
# actually write the record
print $FH @record,"\n";
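# increment rocount. NOTE(review): the guard below fires only when $FH is the exception handle (not "if not exception"), and the final summary labels this counter "dumped" -- confirm which is intended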
- $conf->{rocount}++ if ($FH eq $EXMARC);
+ $c->{rocount}++ if ($FH eq $EXMARC);
# if we were dumping to exception file, nuke the record and set ptr
# to terminate processing loop
# Print the full edit banner to $OUT: blank the current terminal line
# (carriage return plus 72 spaces), show the pending edit message, then the
# tag and both indicators held in %recmeta, then the position of the current
# record as ricount/totalrecs. Finishes by calling print_context().
# Takes no arguments; always returns 0.
sub print_fullcontext {
print $OUT "\r", ' ' x 72, "\n";
- print $OUT $conf->{editmsg},"\n";
+ print $OUT $c->{editmsg},"\n";
print $OUT "\r Tag:",$recmeta{tag}, " Ind1:'",
$recmeta{ind1},"' Ind2:'", $recmeta{ind2}, "'";
- print $OUT " @ ", $conf->{ricount}, "/", $conf->{totalrecs};
+ print $OUT " @ ", $c->{ricount}, "/", $c->{totalrecs};
print_context();
return 0;
}
sub print_context {
- my $upper = int($conf->{window} / 2) + 1;
- my $lower = int($conf->{window} / 2) - 1;
+ my $upper = int($c->{window} / 2) + 1;
+ my $lower = int($c->{window} / 2) - 1;
my $start = ($ptr - $upper < 0) ? 0 : $ptr - $upper;
my $stop = ($ptr + $lower > $#record) ? $#record : $ptr + $lower;
print $OUT "\n";
# Print a one-line notice to $OUT, tagged with the current position in the
# input as ricount/totalrecs.
#   $msg - text of the notice
# The leading "\r" moves the cursor back to the start of the current line
# before the notice is printed.
sub message {
my ($msg) = @_;
- print $OUT "\r$msg at ",$conf->{ricount},"/",$conf->{totalrecs}, "\n";
+ print $OUT "\r$msg at ",$c->{ricount},"/",$c->{totalrecs}, "\n";
}
#-----------------------------------------------------------------------------------
{ print $OUT "No killed line to yank\n"; return }
my @a = @record[0 .. $ptr - 1];
my @b = @record[$ptr .. $#record];
- @record = (@a, $conf->{killline}, @b);
+ @record = (@a, $c->{killline}, @b);
@a = undef; @b = undef;
print_context();
return 0;
# Always returns 1 and does nothing else.
# NOTE(review): presumably a true return tells the interactive edit loop to
# stop -- confirm against the command dispatcher, which is not visible here.
sub commit_edit { return 1 }
# Grow the context window (the count of record lines print_context shows
# around the current line) by two lines, then redisplay the context.
# Refuses, with a message on $OUT, once the window is already 15 lines.
# NOTE(review): the limit is tested with ==, so it only holds while the window
# moves in steps of 2 from an odd starting size.
sub widen_window {
- if ($conf->{window} == 15)
+ if ($c->{window} == 15)
{ print $OUT "Window can't be bigger than 15 lines\n"; return }
- $conf->{window} += 2;
+ $c->{window} += 2;
print_context;
}
# Shrink the context window (the count of record lines print_context shows
# around the current line) by two lines, then redisplay the context.
# Refuses, with a message on $OUT, once the window is already 5 lines.
# NOTE(review): the limit is tested with ==, so it only holds while the window
# moves in steps of 2 from an odd starting size.
sub narrow_window {
- if ($conf->{window} == 5)
+ if ($c->{window} == 5)
{ print $OUT "Window can't be smaller than 5 lines\n"; return }
- $conf->{window} -= 2;
+ $c->{window} -= 2;
print_context;
}
'fullauto',
'exception|x=s',
'output|o=s',
+ 'marcfile|m=s',
'prefix|p=s',
'nocollapse|n',
'renumber-from|rf=i',
'trashhelp',
'help|h',
);
- show_help() unless $rc and @ARGV;
+ show_help() unless $rc;
show_help() if ($c->{help});
show_trashhelp() if ($c->{trashhelp});
# defaults
- my $pfx = $c->{prefix} // "bibs";
+ my $pfx = defined($c->{prefix}) ? $c->{prefix} : "bibs";
$c->{ricount} = 0;
$c->{rocount} = 0;
- $c->{output} = join('.',$c->{prefix},'clean','marc','xml');
- $c->{exception} = join('.',$c->{prefix},'exception','marc','xml');
$c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'};
$c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'};
$c->{window} = 9;
+ if ($c->{prefix}) {
+ $c->{output} = join('.',$c->{prefix},'clean','marc','xml')
+ unless $c->{output};
+ $c->{exception} = join('.',$c->{prefix},'exception','marc','xml')
+ unless $c->{exception};
+ $c->{marcfile} = $c->{prefix} . '.marc.xml'
+ unless $c->{marcfile};
+ }
+ show_help() unless ($c->{marcfile} and $c->{output});
if ($c->{trashfile}) {
- $c->{trash} = Equinox::Migration::SimpleTagList->new(file => $conf->{trashfile})
+ $c->{trash} = Equinox::Migration::SimpleTagList->new(file => $c->{trashfile})
} else {
$c->{trash} = Equinox::Migration::SimpleTagList->new;
}
# remove original id sequence tag from trash hash if we know it
$c->{trash}->remove_tag($c->{'original-tag'})
if ( $c->{'original-tag'} and $c->{trash}->has($c->{'original-tag'}) );
-
- my @keys = keys %{$c};
- show_help() unless (@ARGV and @keys);
}
# Print the usage text to the currently selected output handle and terminate
# the program. Never returns to the caller.
# NOTE(review): the usage line still advertises a <filelist> argument although
# input is now taken from --marcfile; confirm whether the text should change.
sub show_help {
print <<HELP;
-Usage is: marc-cleanup [OPTIONS] <filelist>
+Usage is: marc_cleanup [OPTIONS] <filelist>
Options
--output -o Cleaned MARCXML output filename
--exception -x Exception (dumped records) MARCXML filename
or
- --prefix=<PREFIX>> -p Shared prefix for output/exception files. Will produce
+ --prefix=<PREFIX> -p Shared prefix for output/exception files. Will produce
PREFIX.clean.marc.xml and PREFIX.exception.marc.xml
+ --marcfile -m Input filename. Defaults to PREFIX.marc.xml
+
--renumber-from -rf Begin renumbering id sequence with this number
--renumber-tag -rt Tag to use in renumbering (default: 903)
--renumber-subfield -rs Subfield code to use in renumbering (default: a)
--fullauto No manual edits. All problematic records dumped to
exception file.
- --script Store human-initiated ops in scriptfile (.mcscript)
- Not yet implemented
HELP
# A bare exit ends the program with status 0, including when this sub is
# reached because option parsing failed.
exit;
}