From 9629f1b4347cebc0af040e89dde5c739e5edd3d9 Mon Sep 17 00:00:00 2001 From: Shawn Boyette Date: Mon, 10 Nov 2008 17:25:52 +0000 Subject: [PATCH] miker_filter integration work finally proceeding --- filter_record.ids | 114 ++++++++++++++++++++++++++++++----------------------- 1 files changed, 65 insertions(+), 49 deletions(-) diff --git a/filter_record.ids b/filter_record.ids index 99c6171..1f2b754 100644 --- a/filter_record.ids +++ b/filter_record.ids @@ -3,53 +3,43 @@ use warnings; use strict; use Getopt::Long; -use Time::HiRes qw/time/; -use MARC::Record; -use MARC::File::XML ( BinaryEncoding => 'utf-8' ); - -# THIS FILE EXTRACTS NONMATCHING RECORDS +#use MARC::Record; +#use MARC::File::XML ( BinaryEncoding => 'utf-8' ); # configuration hashref -my $conf = (); -#initialize($conf); - -my $idfile = shift; -my $marcfile = shift; -my $import = shift; -my $shelve = shift; +my $conf = {}; +initialize($conf); my %id; -open F, "<$idfile"; +open F, "<", $conf->{idfile}; while () { chomp; $id{$_} = 1; } - close F; my $M; my $I; my $S; -open $M, '<:utf8', $marcfile; -open $I, '>:utf8', $import; -open $S, '>:utf8', $shelve; - -my $starttime = time; -my $count = 0; -my $icount = 0; -my $scount = 0; +open $M, '<:utf8', $conf->{marcfile}; +open $I, '>:utf8', $conf->{'output-import'}; +open $S, '>:utf8', $conf->{'output-shelve'}; + while (<$M>) { - /tag="903" ind1=" " ind2=" ">.*?(\d+){tag}; + my $sub = $conf->{subfield}; + + /tag="$tag" ind1=" " ind2=" ">.*?(\d+){incoming}) { + print $S $_ if ($id{$1}); + print $I $_ unless ($id{$1});; } else { - print $I $_; - $icount++; + print $S $_ unless ($id{$1}); + print $I $_ if ($id{$1});; } - $count++; + $conf->{count}++; - unless ($count && $count % 100) { - print STDERR "\r$count\t(shelved: $scount, import: $icount)\t". $count / (time - $starttime); + unless ($conf->{count} % 100) { + print STDERR "\rProcessed: ",$conf->{count}; } } @@ -70,11 +60,12 @@ sub initialize { my $rc = GetOptions( $c, 'incoming', 'incumbent', - 'incoming-tag|incot=i', - 'incoming-subfield|incos=s', - 'incumbent-tag|incut=i', - 'incumbent-subfield|incus=s', - 'output|o=s', + 'tag|t=i', + 'subfield|s=s', + 'idfile|i=s', + 'marcfile|m=s', + 'outputimport|oi=s', + 'outputshelved|os=s', 'help|h', ); show_help() unless $rc; @@ -82,15 +73,18 @@ sub initialize { $c->{'incoming-tag'} = 903; $c->{'incoming-subfield'} = 'a'; - $c->{'incoming-matchfile'} = ''; - $c->{'incoming-nomatchfile'} = ''; $c->{'incumbent-tag'} = 901; - $c->{'incumbent-subfield'} = 'a'; - $c->{'incumbent-matchfile'} = ''; - $c->{'incumbent-nomatchfile'} = ''; + $c->{'incumbent-subfield'} = 'c'; my @keys = keys %{$c}; - show_help() unless (@ARGV and @keys); - for my $key ('renumber-from', 'tag', 'subfield', 'output') + unless ($c->{incoming} or $c->{incumbent}) { + print "One of --incoming or --incumbent is required.\n"; + show_help(); + } + if ($c->{incoming} and $c->{incumbent}) { + print "Only one of --incoming or --incumbent can be specified.\n"; + show_help(); + } + for my $key ('idfile', 'marcfile', 'output-import', 'output-shelved') { push @missing, $key unless $c->{$key} } if (@missing) { print "Required option: ", join(', ', @missing), " missing!\n"; @@ -108,14 +102,36 @@ Display usage message when things go wrong sub show_help { print < -o Output filename - -Any number of input files may be specified; one output file will result. + --incoming \\___ One (and only one) of these two must + --incumbent / be specified + + If --incoming is specified, the record ids in the file specified by + --idfile will be used as EXCLUSION data. That is, the given record + ids will be treated as records which match incumbent records and are + being compressed into existing data, and so WILL NOT be + imported. The --output-import file will contain records whose ids DO + NOT occur in --idfile; --output-shelve will contain the records + which DO occur. + + If --incumbent is specified, the reverse occurs. + + --idfile -i File of record ids to use as source for matchpoints + --marcfile -m MARCXML source file + --output-import -oi Output MARCXML file for records to be imported + --output-shelve -os Output MARCXML file for records to be ignored + +Optional Arguments + --tag -t MARC tag to use as matchpoint (default 903 for incoming, + 901 for incumbent) + --subfield -s Subfield of tag to use (default 'c' for incoming, 'a' + for incumbent) HELP exit 1; } -- 1.7.2.5