From 50027a222d737e07d35e7f50f75cd033013f6db3 Mon Sep 17 00:00:00 2001 From: Don McMorris Date: Thu, 31 Jul 2008 20:49:23 +0000 Subject: [PATCH] changes by mike to skip crap records --- split_marc.pl | 20 ++++++++++++++++---- spot_check.pl | 10 +++++++++- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/split_marc.pl b/split_marc.pl index c60aab4..e6088b1 100644 --- a/split_marc.pl +++ b/split_marc.pl @@ -5,24 +5,29 @@ use MARC::Record; use MARC::File::XML ( BinaryEncoding => 'utf-8' ); use MARC::Field; use POSIX; +use Error qw/:try/; my $split_every = $ARGV[0]; my $count = 0; binmode(STDOUT, ':utf8'); binmode(STDIN, ':utf8'); +my $M; foreach $argnum ( 1 .. $#ARGV ) { + + open $M, '<:utf8', $ARGV[$argnum]; print STDERR "Processing " . $ARGV[$argnum] . "\n"; - my $batch = MARC::Batch->new('XML',$ARGV[$argnum]); + my $batch = MARC::Batch->new('XML', $M); $batch->strict_off(); $batch->warnings_off(); - while ( my $record = $batch->next() ) { - - $count++; + my $record; + while ( try { $record = $batch->next() } otherwise { $record = -1 } ) { + next if ($record == -1); + $count++; my $filename = $ARGV[$argnum] . ".split." . floor( $count / $split_every ) . ".xml"; @@ -30,6 +35,13 @@ foreach $argnum ( 1 .. $#ARGV ) { binmode(FILE, ':utf8'); print FILE $record->as_xml(); close FILE; + + $record = undef; + + unless ($count % 1000) { + print STDERR "$count\r" + } + } print STDERR "Processed $count records.\n"; } diff --git a/spot_check.pl b/spot_check.pl index 0da2c75..6f8011b 100644 --- a/spot_check.pl +++ b/spot_check.pl @@ -10,11 +10,15 @@ my $count = 0; binmode(STDOUT, ':utf8'); binmode(STDIN, ':utf8'); +my $M; + foreach $argnum ( 0 .. $#ARGV ) { print STDERR "Processing " . $ARGV[$argnum] . "\n"; - my $batch = MARC::Batch->new('XML',$ARGV[$argnum]); + open $M, '<:utf8', $ARGV[$argnum]; + + my $batch = MARC::Batch->new('XML',$M); $batch->strict_off(); $batch->warnings_off(); @@ -29,6 +33,10 @@ foreach $argnum ( 0 .. $#ARGV ) { print STDERR "WARNINGS: Record $count : " . join(":",@warnings) . " : continuing...\n" if ( @warnings ); + unless ($count % 1000) { + print STDERR "$count\r" + } + } }; print STDERR "Processed $count records. Last successful record = " . $last_successful_record . "\n"; -- 1.7.2.5