my $term = new Term::ReadLine 'yaz-cleanup';
my $OUT = $term->OUT || \*STDOUT;
-my $count = 0;
-my $reccount = 0;
-my $oreccount = 0;
-my $line = '';
-my %trash = (); # hash for tags to be dumped
+# initialization and setup
+my $conf = {};
+initialize($conf);
+populate_trash() if ($conf->{trash});
-# read in trash tags file if it exists
-populate_trash() if (-e '.trashtags');
-
-my @record = (); # current record storage
+my @record = (); # current record storage
my %recmeta = (); # metadata about current record
-my @context= (); # last 5 lines of file
+my $recptr = 0; # record index pointer
my $input = shift || 'incoming.marc.xml';
-open MARC, '<', $input;
-open my $NUMARC, '>', 'incoming.clean.marc.xml';
+open MARC, '<:utf8', $input;
+open my $NUMARC, '>:utf8', $conf->{output};
print $NUMARC '<?xml version="1.0" encoding="UTF-8"?>',"\n";
print $NUMARC '<collection xmlns="http://www.loc.gov/MARC21/slim">',"\n";
-open my $EXMARC, '>', 'incoming.exceptions.marc.xml';
-print $EXMARC '<?xml version="1.0" encoding="UTF-8"?>',"\n";
-print $EXMARC '<collection xmlns="http://www.loc.gov/MARC21/slim">',"\n";
-open MARC2, '<', $input;
-<MARC2>;
+open my $EXMARC, '>:utf8', $conf->{exception};
# this is the dispatch table which drives command selection in
# edit(), below
k => \&kill_line,
o => \&show_original,
m => \&merge_lines,
+ n => \&next_line,
+ p => \&prev_line,
s => \&substitute,
t => \&commit_edit,
x => \&dump_record,
my @spinner = qw(- / | \\);
my $sidx = 0;
-while (my $line = getline()) {
- unless ($count % 2000) {
+while ( buildrecord() ) {
+ unless ($conf->{ricount} % 100) {
print "\rWorking... ", $spinner[$sidx];
$sidx = ($sidx == $#spinner) ? 0 : $sidx + 1;
}
- update_linecontext();
-
- # catch empty datafield elements
- if ($line =~ m|</datafield>|) {
- if ($record[-2] =~ m/<datafield tag="..." ind1="." ind2=".">/) {
- pop @record; pop @record;
- message("Empty datafield scrubbed");
- next;
+
+ do_automated_cleanups();
+
+ $recptr = 0;
+ until ($recptr == $#record) {
+ # naked ampersands
+ if ($record[$recptr] =~ /&/ && $record[$recptr] !~ /&\w+?;/)
+ { edit("Naked ampersand"); $recptr= 0; next }
+
+ # tags must be numeric
+ if ($record[$recptr] =~ /<datafield tag="(.+?)"/) {
+ my $match = $1;
+ if ($match =~ /\D/) {
+ edit("Non-numerics in tag");
+ $recptr = 0;
+ next;
+ }
}
- }
- # pad short leaders
- if ($line =~ m|<leader>(.+?)</leader>|) {
- my $leader = $1;
- if (length $leader < 24) {
- $leader .= ' ' x (20 - length($leader));
- $leader .= "4500";
- $line = "<leader>$leader</leader>\n";
- message("Short leader padded");
+ # subfields can't be non-alphanumeric
+ if ($record[$recptr] =~ /<subfield code="(.*?)"/) {
+ my $match = $1;
+ if ($match =~ /\P{IsAlnum}/ or $match eq '') {
+ edit("Junk in subfield code/Null subfield code");
+ $recptr = 0;
+ next;
+ }
}
+ $recptr++;
}
+ write_record($NUMARC);
+}
+print $NUMARC "</collection>\n";
+print $OUT "\nDone. \n";
- # clean misplaced dollarsigns
- if ($line =~ m|<subfield code="\$">c?\d+\.\d{2}|) {
- $line =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|;
- message("Dollar sign corrected");
- }
- # clean up tags with spaces in them
- $line =~ s/tag=" /tag="00/g;
- $line =~ s/tag=" /tag="0/g;
- $line =~ s/tag="-/tag="0/g;
- $line =~ s/tag="(\d\d) /tag="0$1/g;
-
- # stow tag data if we're looking at it
- if ($line =~ m/<datafield tag="(.{3})" ind1="(.)" ind2="(.)">/) {
- $recmeta{tag} = $1;
- $recmeta{ind1} = $2;
- $recmeta{ind2} = $3;
- }
+#-----------------------------------------------------------------------------------
+# cleanup routines
+#-----------------------------------------------------------------------------------
- # and stow line back in record
- $record[-1] = $line;
+sub do_automated_cleanups {
+ $recptr = 0;
+ until ($recptr == $#record) {
+ # catch empty datafield elements
+ if ($record[$recptr] =~ m/<datafield tag="..." ind1="." ind2=".">/) {
+ if ($record[$recptr + 1] =~ m|</datafield>|) {
+ my @a = @record[0 .. $recptr - 1];
+ my @b = @record[$recptr + 2 .. $#record];
+ @record = (@a, @b);
+ message("Empty datafield scrubbed");
+ $recptr = 0;
+ next;
+ }
+ }
+ # and quasi-empty subfields
+ if ($record[$recptr] =~ m|<subfield code="(.*?)">(.*?)</sub|) {
+ my $code = $1; my $content = $2;
+ if ($code =~ /\W/ and ($content =~ /\s+/ or $content eq '')) {
+ my @a = @record[0 .. $recptr - 1];
+ my @b = @record[$recptr + 1 .. $#record];
+ @record = (@a, @b);
+ message("Empty subfield scrubbed");
+ $recptr = 0;
+ next;
+ }
+ }
+ $recptr++;
+ }
- # naked ampersands
- if ($line =~ /&/ && $line !~ /&\w+?;/)
- { edit("Naked ampersand", $line); next }
+ # single-line fixes
+ for $recptr (0 .. $#record) {
+ # pad short leaders
+ if ($record[$recptr] =~ m|<leader>(.+?)</leader>|) {
+ my $leader = $1;
+ if (length $leader < 24) {
+ $leader .= ' ' x (20 - length($leader));
+ $leader .= "4500";
+ $record[$recptr] = "<leader>$leader</leader>\n";
+ message("Short leader padded");
+ }
+ }
+ if ($record[$recptr] =~ m|<controlfield tag="008">(.+?)</control|) {
+ #pad short 008
+ my $content = $1;
+ if (length $content < 40) {
+ $content .= ' ' x (40 - length($content));
+ $record[$recptr] = "<controlfield tag=\"008\">$content</controlfield>\n";
+ message("Short 008 padded");
+ }
+ }
- # tags must be numeric
- if ($line =~ /<datafield tag="(.+?)"/) {
- my $match = $1;
- if ($match =~ /\D/) {
- edit("Non-numerics in tag", $line);
- next;
+ # clean misplaced dollarsigns
+ if ($record[$recptr] =~ m|<subfield code="\$">c?\d+\.\d{2}|) {
+ $record[$recptr] =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|;
+ message("Dollar sign corrected");
}
- }
- # subfields can't be non-alphanumeric
- if ($line =~ /<subfield code="(.+?)"/) {
- my $match = $1;
- if ($match =~ /\P{IsAlnum}/) {
- edit("Junk in subfield code", $line);
- next;
+ # clean up tags with spaces in them
+ $record[$recptr] =~ s/tag=" /tag="00/g;
+ $record[$recptr] =~ s/tag=" /tag="0/g;
+ $record[$recptr] =~ s/tag="-/tag="0/g;
+ $record[$recptr] =~ s/tag="(\d\d) /tag="0$1/g;
+
+ # stow tag data if we're looking at it
+ if ($record[$recptr] =~ m/<datafield tag="(.{3})" ind1="(.)" ind2="(.)">/) {
+ $recmeta{tag} = $1;
+ $recmeta{ind1} = $2;
+ $recmeta{ind2} = $3;
}
- }
+ # automatable subfield maladies
+ $record[$recptr] =~ s/code=" ">c/code="c">/;
+ $record[$recptr] =~ s/code=" ">\$/code="c"$>/;
+ }
}
-print $NUMARC "</collection>\n";
-print $EXMARC "</collection>\n";
-print $OUT "\nDone. \n";
+
+#-----------------------------------------------------------------------------------
+# driver routines
+#-----------------------------------------------------------------------------------
+
=head2 edit
=cut
sub edit {
- my ($msg, $line_in) = @_;
- return if $trash{$recmeta{tag}};
- message($msg);
+ my ($msg) = @_;
+
+ return if $conf->{trash}{ $recmeta{tag} };
+ message($msg, 1);
print_context();
while (1) {
{ next unless (defined $commands{$chunks[0]}) }
if (defined $commands{$chunks[0]}) {
- my $term = $commands{$chunks[0]}->($line_in, @chunks[1..$#chunks]);
+ my $term = $commands{$chunks[0]}->(@chunks[1..$#chunks]);
last if $term;
} else {
- if ($context[3] eq " [LINE KILLED]\n") {
- push @record, "$line\n"
- } else {
- $record[-1] = "$line\n";
- }
- $context[3] = "$line\n";
+ $record[$recptr] = "$line\n";
print_linecontext();
}
}
}
-=head2 getline
+=head2 buildrecord
-Reads from the incoming MARC file; returns lines into the driver
-loop. Batches records for output, and maintains the context listing.
+Constructs record arrays from the incoming MARC file and returns them
+to the driver loop.
=cut
-sub getline {
- my $l = <MARC>;
- $count++;
- if (defined $l) {
- if ($l =~ /<record>/) {
- @record = ($l);
- %recmeta = ();
- $reccount++;
- } elsif ($l =~ m|</record>|) {
- push @record, $l;
- write_record($NUMARC) if $reccount;
- } else {
- push @record, $l;
- }
- }
- return $l;
+sub buildrecord {
+ my $l = '';
+ $l = <MARC> while (defined $l and $l !~ /<record>/);
+ return $l unless defined $l;
+ @record = ($l);
+ %recmeta = ();
+ $conf->{ricount}++;
+
+ until ($l =~ m|</record>|)
+ { push @record, $l; $l = <MARC>; }
+ push @record, $l;
+ return 1;
}
sub write_record {
my ($FH) = @_;
- $oreccount++ if ($FH eq $NUMARC);
+ my $trash = $conf->{trash};
+
+ $conf->{rocount}++ if ($FH eq $NUMARC);
print $FH '<!-- ', $recmeta{explanation}, " -->\n"
if(defined $recmeta{explanation});
+
+ # excise unwanted tags
+ if (keys %{$trash} or $conf->{autoscrub}) {
+ my @trimmed = ();
+ my $istrash = 0;
+ for my $line (@record) {
+ if ($istrash) {
+ $istrash = 0 if $line =~ m|</datafield|;
+ next;
+ }
+ if ($line =~ m/<datafield tag="(.{3})"/) {
+ my $tag = $1;
+ if ($trash->{$tag} or ($conf->{autoscrub} and $tag =~ /\D/)) {
+ $istrash = 1;
+ next
+ }
+ }
+ push @trimmed, $line;
+ }
+ @record = @trimmed;
+ }
+
+ # scrub newlines
+ unless ($conf->{nocollapse}) {
+ s/\n// for (@record);
+ }
+
+ # add 903(?) with new record id
+ my $renumber = '';
+ if ($conf->{'renumber-from'}) {
+ $renumber = join('', '<datafield tag="', $conf->{'renumber-tag'},
+ '" ind1=" " ind2=" ">',
+ '<subfield code="', $conf->{'renumber-subfield'}, '">',
+ $conf->{'renumber-from'}, '</subfield></datafield>');
+ $renumber .= "\n" if $conf->{nocollapse};
+ push @record, $renumber;
+ $conf->{'renumber-from'}++;
+ }
+
print $FH @record;
+ print $FH "</record>\n";
+}
+
+sub print_context {
+ print " Tag:",$recmeta{tag}, " Ind1:'",
+ $recmeta{ind1},"' Ind2:'", $recmeta{ind2}, "'\n";
+ print_linecontext();
+ return 0;
}
-sub update_linecontext {
- my $line2 = <MARC2>;
- push @context, $line2;
- shift @context if (@context > 5);
+sub print_linecontext {
+ my $low = ($recptr - 3 < 0) ? 0 : $recptr - 3;
+ print $OUT ' |', $record[$_] for ($low .. $recptr - 1);
+ print $OUT '==> |', $record[$recptr];
+ print $OUT ' |', $record[$recptr + 1], "\n";
+ return 0;
}
sub message {
- my ($msg) = @_;
- print $OUT "\r$msg at record $reccount/",$oreccount + 1,"\n";
+ my ($msg, $new) = @_;
+ print $OUT "\r", ' ' x 72, "\n" if $new;
+ print $OUT "\r$msg at ",$conf->{ricount},"/",$conf->{rocount} + 1,"\n";
}
$char = "\\" . $char if ($char =~ /\W/);
$from = join('', $from, $char);
}
- $record[-1] =~ s/$from/$to/;
- $context[3] = $record[-1];
+ $record[$recptr] =~ s/$from/$to/;
print_linecontext();
return 0;
}
sub merge_lines {
- my $last = pop @record;
- $last =~ s/^\s+//;
- $record[-1] =~ s/\n//;
- $record[-1] = join('', $record[-1], $last);
- my @temp = ("\n");
- push @temp, @context[0..1];
- $temp[3] = $record[-1];
- $temp[4] = $context[4];
- @context = @temp;
- print_linecontext();
+ # remove <subfield stuff; extract (probably wrong) subfield code
+ $record[$recptr] =~ s/^\s*<subfield code="(.*?)">//;
+ # and move to front of line
+ $record[$recptr] = join(' ', $1 , $record[$recptr]);
+ # tear off trailing subfield tag from preceeding line
+ $record[$recptr - 1] =~ s|</subfield>\n||;
+ # join current line onto preceeding line
+ $record[$recptr - 1] = join('', $record[$recptr - 1], $record[$recptr]);
+ # erase current line
+ my @a = @record[0 .. $recptr - 1];
+ my @b = @record[$recptr + 1 .. $#record];
+ @record = (@a, @b);
+ # move record pointer to previous line
+ prev_line();
return 0;
}
sub kill_line {
- pop @record;
- $context[3] = " [LINE KILLED]\n";
+ my @a = @record[0 .. $recptr - 1];
+ my @b = @record[$recptr + 1 .. $#record];
+ @record = (@a, @b);
print_linecontext();
return 0;
}
sub dump_record {
my ($line_in, @explanation) = @_;
- $recmeta{explanation} = join(' ', @explanation);
- my $line = <MARC>; $count++;
- update_linecontext();
- until ($line =~ m|</record>|) {
- push @record, $line;
- $line = <MARC>; $count++;
- update_linecontext();
- }
- push @record, $line;
+ $recmeta{explanation} = join(' ', 'Tag', $recmeta{tag}, @explanation);
write_record($EXMARC);
return 1;
}
-sub commit_edit { return 1 }
-
-sub print_context {
- print "\n Tag:",$recmeta{tag}, " Ind1:'",
- $recmeta{ind1},"' Ind2:'", $recmeta{ind2}, "'";
+sub next_line {
+ $recptr++;
print_linecontext();
return 0;
}
-sub print_linecontext {
- print $OUT "\n", join(' |','',@context[0..2]);
- print $OUT '==> |', $context[3];
- print $OUT ' |', $context[4],"\n";
+sub prev_line {
+ $recptr--;
+ print_linecontext();
return 0;
}
+sub commit_edit { return 1 }
+
sub show_original {
my ($line_in) = @_;
print $OUT "\n$line_in\n";
Type a replacement for the indicated line, or enter a command.
-Commands: c Show record context ('C' for brief context)
+Commands: c Show record context again ('C' for brief context)
k Kill indicated line (remove from record)
m Merge indicated line with previous line
o Show original line
# specifying a tag twice is an error, to help prevent typos
sub populate_trash {
- open TRASH, '<', '.trashtags';
+ print $OUT ">>> TRASHTAGS FILE FOUND. LOADING TAGS TO BE STRIPPED FROM OUTPUT\n";
+ open TRASH, '<', $conf->{trash}
+ or die "Can't open trash tags file!\n";
while (<TRASH>) {
my $lastwasrange = 0;
my %lastrange = ( high => 0, low => 0);
die "Unknown chunk $chunk in .trashtags file (line $.)\n";
}
}
- print $OUT join ",", (sort keys %trash);
- exit
+
+ # remove original id sequence tag from trash hash if we know it
+ trash_add($conf->{'original-tag'}, 1)
+ if ($conf->{'original-tag'} and $conf->{trash}{ $conf->{'original-tag'} });
}
sub trash_add_range {
sub trash_add {
my ($tag, $except) = @_;
+ my $trash = $conf->{trash};
+
die "Trash values must be valid tags (000-999)\n"
unless ($tag >= 0 and $tag <= 999);
+
if ($except) {
- delete $trash{$tag};
+ delete $trash->{$tag};
} else {
die "Trash tag '$tag' specified twice (line $.)\n"
- if $trash{$tag};
- $trash{$tag} = 1;
+ if $trash->{$tag};
+ $trash->{$tag} = 1;
}
}
+
+#-----------------------------------------------------------------------
+
+=head2 initialize
+
+Performs boring script initialization. Handles argument parsing,
+mostly.
+
+=cut
+
+sub initialize {
+ my ($c) = @_;
+ my @missing = ();
+
+ # set mode on existing filehandles
+ binmode(STDIN, ':utf8');
+
+ my $rc = GetOptions( $c,
+ 'autoscrub|a',
+ 'exception|x=s',
+ 'output|o=s',
+ 'nocollapse|n',
+ 'renumber-from|rf=i',
+ 'original-tag|ot=i',
+ 'renumber-tag|rt=i',
+ 'renumber-subfield|rt=i',
+ 'trash|t=s',
+ 'help|h',
+ );
+ show_help() unless $rc;
+ show_help() if ($c->{help});
+
+ # defaults
+ $c->{output} = 'incoming.cleaned.marc.xml' unless defined $c->{output};
+ $c->{exception} = 'incoming.exception.marc.xml' unless defined $c->{exception};
+ $c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'};
+ $c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'};
+
+ my @keys = keys %{$c};
+ show_help() unless (@ARGV and @keys);
+ #for my $key ('runtype', 'tag', 'subfield', 'output', 'exception')
+ # { push @missing, $key unless $c->{$key} }
+ #if (@missing) {
+ # print "Required option: ", join(', ', @missing), " missing!\n";
+ # show_help();
+ #}
+}
+
+sub show_help {
+ print <<HELP;
+Usage is: $0 [OPTIONS] <filelist>
+Options
+ --output -o Cleaned MARCXML output filename (default: incoming.cleaned.marc.xml)
+ --exception -x Exception (dumped records) MARCXML filename (incoming.exception.marc.xml)
+HELP
+exit;
+}