#!/usr/bin/perl # Copyright 2009-2012, Equinox Software, Inc. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. require 5.10.0; use strict; use warnings; use Getopt::Long; use Term::ReadLine; use Equinox::Migration::SimpleTagList; my $term = new Term::ReadLine 'yaz-cleanup'; my $OUT = $term->OUT || \*STDOUT; binmode STDOUT, ":utf8"; binmode $OUT, ":utf8"; $| = 1; # initialization and setup my $c = {}; initialize($c); # set up files, since everything appears to be in order open MARC, '<:utf8', $c->{marcfile} or die "Can't open input file $!\n"; open my $NUMARC, '>:utf8', $c->{output} or die "Can't open output file $!\n"; open my $OLD2NEW, '>', 'old2new.map' if ($c->{'renumber-from'} and $c->{'original-tag'}); my $EXMARC = 'EX'; print $NUMARC "\n"; $c->{totalrecs} = `grep -c '{marcfile}`; chomp $c->{totalrecs}; $c->{percent} = 0; my @record; # current record storage my %recmeta; # metadata about current record my $ptr = 0; # record index pointer # this is the dispatch table which drives command selection in # edit(), below my %commands = ( c => \&print_fullcontext, n => \&next_line, p => \&prev_line, '<' => \&widen_window, '>' => \&narrow_window, d => \&display_lines, o => \&insert_original, k => \&kill_line, y => \&yank_line, f => \&flip_line, m => \&merge_lines, s => \&substitute, t => \&commit_edit, x => \&dump_record, q => \&quit, '?' => \&help, h => \&help, help => \&help, ); my @spinner = qw(- \\ | /); my $sidx = 0; while ( buildrecord() ) { unless ($c->{ricount} % 50) { $c->{percent} = int(($c->{ricount} / $c->{totalrecs}) * 100); print "\rWorking (",$c->{percent},"%) ", $spinner[$sidx]; $sidx = ($sidx == $#spinner) ? 0 : $sidx + 1; } my $rc = do_automated_cleanups(); next if $rc; $ptr = 0; until ($ptr == $#record) { # get datafield/tag data if we have it $rc = stow_record_data() if ($c->{'renumber-from'} and $c->{'original-tag'}); next if $rc; # naked ampersands if ($record[$ptr] =~ /&/ && $record[$ptr] !~ /&\w+?;/) { edit("Naked ampersand"); $ptr= 0; next } if ($record[$ptr] =~ /{autoscrub}; next; } } # subfields can't be non-alphanumeric if ($record[$ptr] =~ /\n"; print $OUT "\nDone. ",$c->{ricount}," in; ",$c->{rocount}," dumped \n"; #----------------------------------------------------------------------------------- # cleanup routines #----------------------------------------------------------------------------------- sub do_automated_cleanups { $ptr = 0; until ($ptr == $#record) { # catch empty datafield elements if ($record[$ptr] =~ m/|) { my @a = @record[0 .. $ptr - 1]; my @b = @record[$ptr + 2 .. $#record]; @record = (@a, @b); @a = undef; @b = undef; message("Empty datafield scrubbed"); $ptr = 0; next; } } # and quasi-empty subfields if ($record[$ptr] =~ m|(.*?)(.+?)|) { my $leader = $1; if (length $leader < 24) { $leader .= ' ' x (20 - length($leader)); $leader .= "4500"; $record[$ptr] = "$leader\n"; message("Short leader padded"); } } if ($c->{'force-utf8'}) { if ($record[$ptr] =~ m|(.........).(.+)|) { $record[$ptr] = "$1a$2\n"; } } if ($record[$ptr] =~ m|(.+?)$content\n"; message("Short 008 padded"); } } # clean misplaced dollarsigns if ($record[$ptr] =~ m|c?\d+\.\d{2}|) { $record[$ptr] =~ s|"\$">c?(\d+\.\d{2})|"c">\$$1|; message("Dollar sign corrected"); } # excessive trailing whitespace in subfield contents if ($record[$ptr] =~ m|\s{10,}|) { $record[$ptr] =~ s|\s{10,}||; message("Trailing whitespace trimmed from subfield contents"); } # automatable subfield maladies if ($record[$ptr] =~ /code=" ">c/) { message('Fixing probable subfield c, scenario 1'); $record[$ptr] =~ s/code=" ">c/code="c">/; } if ($record[$ptr] =~ /code=" ">\$/) { message('Fixing probable subfield c, scenario 2'); $record[$ptr] =~ s/code=" ">\$/code="c">\$/; } if ($c->{'fix-subfield'}) { if ($record[$ptr] =~ /code="&">/) { message('Fixing & for subfield code'); $record[$ptr] =~ s/code="&">/code="$c->{'fix-subfield'}">/; } if ($record[$ptr] =~ /code="(.*?\P{IsAlnum}.*?)">/) { message("Fixing non-alphanumeric subfield code: $1 -> " . $c->{'fix-subfield'}); $record[$ptr] =~ s/code=".*?\P{IsAlnum}.*?">/code="$c->{'fix-subfield'}">/; } if ($record[$ptr] =~ /code="">/) { message('Fixing null subfield code'); $record[$ptr] =~ s/code="">/code="$c->{'fix-subfield'}">/; } } } return 0; } sub stow_record_data { # get tag data if we're looking at it my $tag = 0; if ($record[$ptr] =~ m/<(?:control|data)field tag="(.{3})"/) { $recmeta{tag} = $1; $tag = $recmeta{tag}; $record[$ptr] =~ m/ind1="(.)"/; $recmeta{ind1} = $1 || ''; $record[$ptr] =~ m/ind2="(.)"/; $recmeta{ind2} = $1 || ''; unless ($tag) { message("Autokill record: no detectable tag"); dump_record("No detectable tag") ; return 1; } # and since we are looking at a tag, see if it's the original id if ($tag == $c->{'original-tag'}) { my $oid = 0; if ($tag < 10) { # controlfield if ($record[$ptr] =~ m|(.+?)|) { $oid = $1; print $OLD2NEW "$oid\t", $recmeta{nid}, "\n" } } elsif ($tag >= 10 and $c->{'original-subfield'}) { # datafield my $line = $record[$ptr]; my $lptr = $ptr; my $osub = $c->{'original-subfield'}; # skim to end of this tag until ($line =~ m||) { if ($line =~ /(.+?)new map file if ($c->{'renumber-from'} and $c->{'original-subfield'}) { } } } return 0; } #----------------------------------------------------------------------------------- # driver routines #----------------------------------------------------------------------------------- =head2 edit Handles the Term::ReadLine loop =cut sub edit { my ($msg) = @_; return if $c->{trash}->has( $recmeta{tag} ); if ( $c->{fullauto} ) { dump_record($msg); return } $c->{editmsg} = $msg; print_fullcontext(); # stow original problem line $recmeta{origline} = $record[$ptr]; while (1) { my $line = $term->readline('marc-cleanup>'); my @chunks = split /\s+/, $line; # lines with single-character first chunks are commands. # make sure they exist. if (length $chunks[0] == 1) { unless (defined $commands{$chunks[0]}) { print $OUT "No such command '", $chunks[0], "'\n"; next; } } if (defined $commands{$chunks[0]}) { my $term = $commands{$chunks[0]}->(@chunks[1..$#chunks]); last if $term; } else { $recmeta{prevline} = $record[$ptr]; $record[$ptr] = "$line\n"; print_context(); } } # set pointer to top on the way out $ptr = 0; } =head2 buildrecord Constructs record arrays from the incoming MARC file and returns them to the driver loop. =cut sub buildrecord { my $l = ''; my $istrash = 0; my $trash = $c->{trash}; $l = while (defined $l and $l !~ /{ricount}++; for (keys %recmeta) { $recmeta{$_} = undef } for (0 .. @record) { delete $record[$_] } my $i = 0; until ($l =~ m||) { # clean up tags with spaces in them $l =~ s/tag=" /tag="00/g; $l =~ s/tag=" /tag="0/g; $l =~ s/tag="-/tag="0/g; $l =~ s/tag="(\d\d) /tag="0$1/g; # excise unwanted tags if ($istrash) { $istrash = 0 if ($l =~ m|; next; } if ($l =~ m/has($1) or ($c->{autoscrub} and $1 =~ /\D/)) { $istrash = 1; next } } push @record, $l; $l = ; $i++; } # add 903(?) with new record id if ($c->{'renumber-from'}) { $recmeta{nid} = $c->{'renumber-from'}; push @record, join('', ' ', $recmeta{nid}, "\n"); $c->{'renumber-from'}++; } $i++; push @record, $l; return 1; } sub write_record { my ($FH) = @_; if ($FH eq 'EX') { $EXMARC = undef; open $EXMARC, '>:utf8', $c->{exception} or die "Can't open exception file $!\n"; $FH = $EXMARC; } print $FH '\n" if(defined $recmeta{explanation}); # scrub newlines (unless told not to or writing exception record) unless ($c->{nocollapse} or $FH eq $EXMARC) { s/\n// for (@record) } # actually write the record print $FH @record,"\n"; # increment output record count (if not exception) $c->{rocount}++ if ($FH eq $EXMARC); # if we were dumping to exception file, nuke the record and set ptr # to terminate processing loop @record = ('a'); $ptr = 0; } sub print_fullcontext { print $OUT "\r", ' ' x 72, "\n"; print $OUT $c->{editmsg},"\n"; print $OUT "\r Tag:",$recmeta{tag}, " Ind1:'", $recmeta{ind1},"' Ind2:'", $recmeta{ind2}, "'"; print $OUT " @ ", $c->{ricount}, "/", $c->{totalrecs}; print_context(); return 0; } sub print_context { my $upper = int($c->{window} / 2) + 1; my $lower = int($c->{window} / 2) - 1; my $start = ($ptr - $upper < 0) ? 0 : $ptr - $upper; my $stop = ($ptr + $lower > $#record) ? $#record : $ptr + $lower; print $OUT "\n"; print $OUT ' |', $record[$_] for ($start .. $ptr - 1); print $OUT '==> |', $record[$ptr]; print $OUT ' |', $record[$_] for ($ptr + 1 .. $stop); print $OUT "\n"; return 0; } sub message { my ($msg) = @_; print $OUT "\r$msg at ",$c->{ricount},"/",$c->{totalrecs}, "\n"; } #----------------------------------------------------------------------------------- # command routines #----------------------------------------------------------------------------------- sub substitute { my (@chunks) = @_; my $ofrom = shift @chunks; if ($ofrom =~ /^'/) { until ($ofrom =~ /'$/ or !@chunks) { $ofrom .= join(' ','',shift @chunks) } $ofrom =~ s/^'//; $ofrom =~ s/'$//; } my $to = shift @chunks; if ($to =~ /^'/) { until ($to =~ /'$/ or !@chunks) { $to .= join(' ','',shift @chunks) } $to =~ s/^'//; $to =~ s/'$//; } my $from = ''; for my $char (split(//,$ofrom)) { $char = "\\" . $char if ($char =~ /\W/); $from = join('', $from, $char); } $recmeta{prevline} = $record[$ptr]; $record[$ptr] =~ s/$from/$to/; $ofrom = undef; $to = undef; $from = undef; print_context(); return 0; } sub merge_lines { $recmeta{prevline} = $record[$ptr]; # remove //; # and move to front of line $record[$ptr] = join(' ', $1 , $record[$ptr]); # tear off trailing subfield tag from preceeding line $record[$ptr - 1] =~ s|\n||; # join current line onto preceeding line $record[$ptr - 1] = join('', $record[$ptr - 1], $record[$ptr]); # erase current line my @a = @record[0 .. $ptr - 1]; my @b = @record[$ptr + 1 .. $#record]; @record = (@a, @b); # move record pointer to previous line prev_line(); print_context(); return 0; } sub flip_line { unless ($recmeta{prevline}) { print $OUT "No previously edited line to flip\n"; return } my $temp = $record[$ptr]; $record[$ptr] = $recmeta{prevline}; $recmeta{prevline} = $temp; $temp = undef; print_context(); return 0; } sub kill_line { $recmeta{killline} = $record[$ptr]; my @a = @record[0 .. $ptr - 1]; my @b = @record[$ptr + 1 .. $#record]; @record = (@a, @b); @a = undef; @b = undef; print_context(); return 0; } sub yank_line { unless ($recmeta{killline}) { print $OUT "No killed line to yank\n"; return } my @a = @record[0 .. $ptr - 1]; my @b = @record[$ptr .. $#record]; @record = (@a, $c->{killline}, @b); @a = undef; @b = undef; print_context(); return 0; } sub insert_original { $record[$ptr] = $recmeta{origline}; print_context(); return 0; } sub display_lines { print $OUT "\nOrig. edit line :", $recmeta{origline}; print $OUT "Current flip line:", $recmeta{prevline} if $recmeta{prevline}; print $OUT "Last killed line :", $recmeta{killline} if $recmeta{killline}; print $OUT "\n"; return 0; } sub dump_record { my (@explanation) = @_; $recmeta{explanation} = join(' ', 'DUMPING RECORD: Tag', $recmeta{tag}, @explanation); message( $recmeta{explanation} ); write_record($EXMARC); return 1; } sub next_line { $ptr++ unless ($ptr == $#record);; print_context(); return 0; } sub prev_line { $ptr-- unless ($ptr == 0); print_context(); return 0; } sub commit_edit { return 1 } sub widen_window { if ($c->{window} == 15) { print $OUT "Window can't be bigger than 15 lines\n"; return } $c->{window} += 2; print_context; } sub narrow_window { if ($c->{window} == 5) { print $OUT "Window can't be smaller than 5 lines\n"; return } $c->{window} -= 2; print_context; } sub help { print $OUT < Contract context window | y Yank last killed line p Move pointer to prev line | m Merge current line into preceding line n Move pointer to next line | o Insert original line c Print line context | f Flip current line and last edited line d Print current saved lines | -----------------------------+------------------------------------------- s Subtitute; replace ARG1 in current line with ARG2. If either ARG contains spaces, it must be single-quoted t Commit changes and resume automated operations x Dump record to exception file q Quit HELP return 0; } sub quit { exit } #----------------------------------------------------------------------- =head2 initialize Performs boring script initialization. Handles argument parsing, mostly. =cut sub initialize { my ($c) = @_; my @missing = (); # set mode on existing filehandles binmode(STDIN, ':utf8'); my $rc = GetOptions( $c, 'autoscrub|a', 'fullauto', 'exception|x=s', 'output|o=s', 'marcfile|m=s', 'prefix|p=s', 'nocollapse|n', 'renumber-from|rf=i', 'renumber-tag|rt=i', 'renumber-subfield|rs=s', 'original-tag|ot=i', 'original-subfield|os=s', 'fix-subfield|fs=s', 'force-utf8', 'script', 'no-strip9', 'trashfile|t=s', 'trashhelp', 'help|h', ); show_help() unless $rc; show_help() if ($c->{help}); show_trashhelp() if ($c->{trashhelp}); # defaults my $pfx = defined($c->{prefix}) ? $c->{prefix} : "bibs"; $c->{ricount} = 0; $c->{rocount} = 0; $c->{'renumber-tag'} = 903 unless defined $c->{'renumber-tag'}; $c->{'renumber-subfield'} = 'a' unless defined $c->{'renumber-subfield'}; $c->{window} = 9; if ($c->{marcfile} and $c->{prefix}) { abort('You can not declare a marc file and prefix.'); } if ($c->{marcfile}) { $c->{output} = join('.',$c->{marcfile},'clean') unless $c->{output}; $c->{exception} = join('.',$c->{marcfile},'exception') unless $c->{exception}; } if ($c->{prefix}) { $c->{output} = join('.',$c->{prefix},'clean','marc','xml') unless $c->{output}; $c->{exception} = join('.',$c->{prefix},'exception','marc','xml') unless $c->{exception}; $c->{marcfile} = $c->{prefix} . '.marc.xml' unless $c->{marcfile}; } show_help() unless ($c->{marcfile} and $c->{output}); if ($c->{trashfile}) { $c->{trash} = Equinox::Migration::SimpleTagList->new(file => $c->{trashfile}) } else { $c->{trash} = Equinox::Migration::SimpleTagList->new; } # autotrash 901, 903 unless no strip-nines unless ($c->{'no-strip9'}) { $c->{trash}->add_tag(901); $c->{trash}->add_tag(903); } # remove original id sequence tag from trash hash if we know it $c->{trash}->remove_tag($c->{'original-tag'}) if ( $c->{'original-tag'} and $c->{trash}->has($c->{'original-tag'}) ); } sub abort { my $msg = shift; print STDERR "$0: $msg", "\n"; exit 1; } sub show_help { print < Options --output -o Cleaned MARCXML output filename --exception -x Exception (dumped records) MARCXML filename or --prefix= -p Shared prefix for output/exception files. Will produce PREFIX.clean.marc.xml and PREFIX.exception.marc.xml --marcfile -m Input filename. Defaults to PREFIX.marc.xml --renumber-from -rf Begin renumbering id sequence with this number --renumber-tag -rt Tag to use in renumbering (default: 903) --renumber-subfield -rs Subfield code to use in renumbering (default: a) --original-tag -ot Original id tag; will be kept in output even if it appears in the trash file --original-subfield -os Original id subfield code. If this is specified and renumbering is in effect, an old-to-new mapping file (old2new.map) will be generated. --force-utf8 Rewrite each record so that they describe themselves as UTF-8 encoded --autoscrub -a Automatically remove non-numeric tags in data --fix-subfield -fs Subfield code to use in place of non-alphanumeric or empty subfield codes --nocollapse -n Don't compress records to one line on output --no-strip9 Don't autoremove 901/903 tags in data --trashfile -t File containing trash tag data (see --trashhelp) --fullauto No manual edits. All problematic records dumped to exception file. HELP exit; } sub show_trashhelp { print <