use strict;
use Getopt::Long;
-use Time::HiRes qw/time/;
-use MARC::Record;
-use MARC::File::XML ( BinaryEncoding => 'utf-8' );
-
-# THIS FILE EXTRACTS NONMATCHING RECORDS
+#use MARC::Record;
+#use MARC::File::XML ( BinaryEncoding => 'utf-8' );
# configuration hashref
-my $conf = ();
-#initialize($conf);
-
-my $idfile = shift;
-my $marcfile = shift;
-my $import = shift;
-my $shelve = shift;
+my $conf = {};
+initialize($conf);
my %id;
-open F, "<$idfile";
+open F, "<", $conf->{idfile};
while (<F>) {
chomp;
$id{$_} = 1;
}
-
close F;
my $M; my $I; my $S;
-open $M, '<:utf8', $marcfile;
-open $I, '>:utf8', $import;
-open $S, '>:utf8', $shelve;
-
-my $starttime = time;
-my $count = 0;
-my $icount = 0;
-my $scount = 0;
+open $M, '<:utf8', $conf->{marcfile};
+open $I, '>:utf8', $conf->{'output-import'};
+open $S, '>:utf8', $conf->{'output-shelve'};
+
while (<$M>) {
- /tag="903" ind1=" " ind2=" ">.*?<subfield code="a">(\d+)</;
- if ( $id{$1} ) {
- print $S $_;
- $scount++;
+ my $tag = $conf->{tag};
+ my $sub = $conf->{subfield};
+
+ /tag="$tag" ind1=" " ind2=" ">.*?<subfield code="$sub">(\d+)</;
+ if ($conf->{incoming}) {
+ print $S $_ if ($id{$1});
+ print $I $_ unless ($id{$1});;
} else {
- print $I $_;
- $icount++;
+ print $S $_ unless ($id{$1});
+ print $I $_ if ($id{$1});;
}
- $count++;
+ $conf->{count}++;
- unless ($count && $count % 100) {
- print STDERR "\r$count\t(shelved: $scount, import: $icount)\t". $count / (time - $starttime);
+ unless ($conf->{count} % 100) {
+ print STDERR "\rProcessed: ",$conf->{count};
}
}
my $rc = GetOptions( $c,
'incoming',
'incumbent',
- 'incoming-tag|incot=i',
- 'incoming-subfield|incos=s',
- 'incumbent-tag|incut=i',
- 'incumbent-subfield|incus=s',
- 'output|o=s',
+ 'tag|t=i',
+ 'subfield|s=s',
+ 'idfile|i=s',
+ 'marcfile|m=s',
+ 'outputimport|oi=s',
+ 'outputshelved|os=s',
'help|h',
);
show_help() unless $rc;
$c->{'incoming-tag'} = 903;
$c->{'incoming-subfield'} = 'a';
- $c->{'incoming-matchfile'} = '';
- $c->{'incoming-nomatchfile'} = '';
$c->{'incumbent-tag'} = 901;
- $c->{'incumbent-subfield'} = 'a';
- $c->{'incumbent-matchfile'} = '';
- $c->{'incumbent-nomatchfile'} = '';
+ $c->{'incumbent-subfield'} = 'c';
my @keys = keys %{$c};
- show_help() unless (@ARGV and @keys);
- for my $key ('renumber-from', 'tag', 'subfield', 'output')
+ unless ($c->{incoming} or $c->{incumbent}) {
+ print "One of --incoming or --incumbent is required.\n";
+ show_help();
+ }
+ if ($c->{incoming} and $c->{incumbent}) {
+ print "Only one of --incoming or --incumbent can be specified.\n";
+ show_help();
+ }
+ for my $key ('idfile', 'marcfile', 'output-import', 'output-shelved')
{ push @missing, $key unless $c->{$key} }
if (@missing) {
print "Required option: ", join(', ', @missing), " missing!\n";
sub show_help {
print <<HELP;
+
+The purpose of this utility is to split a MARCXML file in twain,
+producing a set of records which will be imported into Evergreen, and a
+set of records which will not.
+
Usage is: $0 [REQUIRED ARGS]
Req'd Arguments
- --renumber-from=N -rf First id# of new sequence
- --tag=N -t Which tag to use
- --subfield=X -s Which subfield to use
- --output=<file> -o Output filename
-
-Any number of input files may be specified; one output file will result.
+ --incoming \\___ One (and only one) of these two must
+ --incumbent / be specified
+
+ If --incoming is specified, the record ids in the file specified by
+ --idfile will be used as EXCLUSION data. That is, the given record
+ ids will be treated as records which match incumbent records and are
+ being compressed into existing data, and so WILL NOT be
+ imported. The --output-import file will contain records whose ids DO
+ NOT occur in --idfile; --output-shelve will contain the records
+ which DO occur.
+
+ If --incumbent is specified, the reverse occurs.
+
+ --idfile -i File of record ids to use as source for matchpoints
+ --marcfile -m MARCXML source file
+ --output-import -oi Output MARCXML file for records to be imported
+ --output-shelve -os Output MARCXML file for records to be ignored
+
+Optional Arguments
+ --tag -t MARC tag to use as matchpoint (default 903 for incoming,
+ 901 for incumbent)
+ --subfield -s Subfield of tag to use (default 'a' for incoming, 'c'
+ for incumbent)
HELP
exit 1;
}