+++ /dev/null
-#!/usr/bin/perl
-use warnings;
-use strict;
-
-use Getopt::Long;
-
-# configuration hashref
-my $conf = {};
-initialize($conf);
-
-my %id;
-
-open F, "<", $conf->{idfile};
-while (<F>) {
- chomp;
- $id{$_} = 1;
-}
-close F;
-
-my $M; my $I; my $S;
-open $M, '<:utf8', $conf->{marcfile}
- or die "Can't open marcfile '",$conf->{marcfile},"'\n";
-open $I, '>:utf8', $conf->{'output-import'}
- or die "Can't open import file '",$conf->{'output-import'},"'\n";
-open $S, '>:utf8', $conf->{'output-shelved'}
- or die "Can't open shelf file '",$conf->{'output-shelve'},"'\n";
-
-while (<$M>) {
- my $tag = $conf->{tag};
- my $sub = $conf->{subfield};
-
- /tag="$tag" ind1=" " ind2=" ">.*?<subfield code="$sub">(\d+)</;
- if ($conf->{mode} eq "exclude") {
- print $S $_ if ($id{$1});
- print $I $_ unless ($id{$1});;
- } else {
- print $S $_ unless ($id{$1});
- print $I $_ if ($id{$1});;
- }
- $conf->{count}++;
-
- unless ($conf->{count} % 100) {
- print STDERR "\rProcessed: ",$conf->{count};
- }
-}
-
-=head2 initialize
-
-Performs boring script initialization. Handles argument parsing,
-mostly.
-
-=cut
-
-sub initialize {
- my ($c) = @_;
- my @missing = ();
-
- # set mode on existing filehandles
- binmode(STDIN, ':utf8');
-
- my $rc = GetOptions( $c,
- 'mode=s',
- 'include',
- 'tag|t=i',
- 'subfield|s=s',
- 'idfile|i=s',
- 'marcfile|m=s',
- 'output-import|oi=s',
- 'output-shelved|os=s',
- 'help|h',
- );
- show_help() unless $rc;
- show_help() if ($c->{help});
-
- my @keys = keys %{$c};
- for my $key ('mode', 'idfile', 'marcfile', 'tag', 'subfield',
- 'output-import', 'output-shelved')
- { push @missing, $key unless $c->{$key} }
- if (@missing) {
- print "Required option: ", join(', ', @missing), " missing!\n";
- show_help();
- }
- unless ($c->{mode} eq "include" or $c->{mode} eq "exclude") {
- print "Unknown run mode '", $c->{mode}, "'\n";
- show_help();
- }
-}
-
-
-=head2 show_help
-
-Display usage message when things go wrong
-
-=cut
-
-sub show_help {
-print <<HELP;
-Usage is: $0 [ARGS]
-
- --mode=MODE Runmode to use (exclude, include; usually exclude)
- --idfile -i File of record ids to use as source for matchpoints
- --marcfile -m MARCXML source file
- --tag -t MARC tag to use as matchpoint (903 when matching
- incoming records, 901 when matching incumbents)
- --subfield -s Subfield of tag to use ('a' for incoming, 'c'
- for incumbent)
- --output-import -oi Output MARCXML file for records to be imported
- --output-shelved -os Output MARCXML file for records to be ignored
-
-If '--mode=exclude' is specified, the record ids in the file specified
-by -idfile will be used as EXCLUSION data.
-
-That is, the given record ids will be treated as records which match
-incumbent records, are being compressed into existing data, and so
-WILL NOT be imported. The --output-import file will contain records
-whose ids DO NOT occur in --idfile; --output-shelved will contain the
-records which DO occur.
-
-If '--mode=include' is specified, the reverse occurs.
-HELP
-exit 1;
-}
+++ /dev/null
-#!/usr/bin/perl
-
-my $dataset = $ARGV[0];
-
-my $match_to = $ARGV[1];
-my $match_these = $ARGV[2];
-my $match_to_score = $ARGV[3];
-my $match_these_score = $ARGV[4];
-
-print "match_to: $match_to match_these: $match_these\n";
-
-my %pines;
-my %incoming;
-my %match;
-my %candidate_match;
-my %score;
-
-
-# create HOL of incumbent{fingerprint}[ids]
-open FILE, $match_to;
-while (my $line = <FILE>) {
- chomp $line;
- my @fields = split(/\t/,$line);
- my $id = shift @fields;
- my $fp = join '^', @fields;
- if (! defined $pines{ $fp }) { $pines{ $fp } = []; }
- push @{ $pines{ $fp } }, $id;
-}
-close FILE;
-
-# do the same for incoming
-open FILE, $match_these;
-while (my $line = <FILE>) {
- chomp $line;
- my @fields = split(/\t/,$line);
- my $id = shift @fields;
- my $fp = join '^', @fields;
- if (! defined $incoming{ $fp }) { $incoming{ $fp } = []; }
- push @{ $incoming{ $fp } }, $id;
-}
-close FILE;
-
-# scoring file stuffs, which i have never used
-foreach my $file ( $match_to_score, $match_from_score ) {
- open FILE, $file;
- while (my $line = <FILE>) {
- chomp $line;
- my @fields = split(/\|/,$line);
- my $id = shift @fields; $id =~ s/\D//g;
- my $holdings = shift @fields; $holdings =~ s/\D//g;
- my $subtitle = shift @fields; $subtitle =~ s/^\s+//; $subtitle =~ s/\s+$//;
- $score{ $id } = [ $holdings, $subtitle ];
- }
- close FILE;
-}
-
-
-open RECORD_IDS, ">match.record_ids";
-foreach my $fp ( keys %incoming ) {
- # for each incoming fingerprint,
- if (defined $pines{ $fp }) {
- # if there is a matching incumbent fingerprint
- foreach my $id ( @{ $incoming{ $fp } } ) {
- # print all incoming record ids
- print RECORD_IDS "$id\n";
- if ( ! defined $candidate_match{ $id } )
- # and create a mapping of incoming ids to fingerprints
- # (used for scoring)
- { $candidate_match{ $id } = []; }
- push @{ $candidate_match{ $id } }, $fp;
- }
- }
-}
-close RECORD_IDS;
-
-
-# scoring section, which i have never used
-foreach my $id ( keys %candidate_match ) {
- my $subtitle;
- # if score{id} exists set subtitle to the sc
- if (defined $score{ $id })
- { $subtitle = $score{ $id }[1]; }
-
- my @fps = @{ $candidate_match{ $id } };
- my @candidate_pines = ();
-
- my $subtitle_matched = 0;
- my $highest_holdings = 0;
- my $best_pines_id;
-
- foreach my $fp ( @fps ) {
- foreach my $pines_id ( @{ $pines{ $fp } } ) {
- my $pines_subtitle;
- if (defined $score{ $pines_id })
- { $pines_subtitle = $score{ $pines_id }[1]; }
- my $pines_holdings;
- if (defined $score{ $pines_id })
- { $pines_holdings = $score{ $pines_id }[0]; }
- if ($pines_subtitle eq $subtitle) {
- if (! $subtitle_matched) {
- $subtitle_matched = 1;
- $best_pines_id = $pines_id;
- $highest_holdings = -1;
- }
- } else {
- if ($subtitle_matched) { next; }
- }
- if ( $pines_holdings > $highest_holdings ) {
- $highest_holdings = $pines_holdings;
- $best_pines_id = $pines_id;
- }
- }
- }
-
- # this will silently fail, as the filehandle has been closed, but strict
- # is not enabled
- print RECORD_IDS "$best_pines_id\n";
-
- if (! defined $match{ $best_pines_id } )
- { $match{ $best_pines_id } = [ $best_pines_id ]; }
- push @{ $match{ $best_pines_id } }, $id;
-}
-
-
-
-open GROUPINGS, ">match.groupings";
-foreach my $k ( keys %match ) {
- print GROUPINGS join("^",
- "checking",
- $dataset,
- $match{ $k }[0],
- join(",",@{ $match{ $k } }),
- join(",",@{ $match{ $k } })
- ) . "\n";
-
-}
-close GROUPINGS;
-
-
+++ /dev/null
-#!/usr/bin/perl
-use strict;
-use warnings;
-
-use MARC::Batch;
-use Getopt::Long;
-#use MARC::Record;
-use MARC::File::XML ( BinaryEncoding => 'utf-8' );
-#use MARC::Field;
-
-$| = 1;
-
-my $count = 0;
-my $conf = {}; # configuration hashref
-initialize($conf);
-
-binmode(STDIN, ':utf8');
-
-open RENUMBER, '>', $conf->{output};
-binmode(RENUMBER, ':utf8');
-
-foreach my $input ( @ARGV ) {
- print STDERR "Processing $input, starting record id at ",
- $conf->{'renumber-from'},"\n";
-
- my $batch = MARC::Batch->new('XML', $input);
- while ( my $record = $batch->next ) {
- $count++;
- my @warnings = $batch->warnings;
- print STDERR "WARNINGS: Record $count : ",
- join(":",@warnings), " : continuing...\n"
- if ( @warnings );
-
- while ($record->field($conf->{tag}))
- { $record->delete_field( $record->field($conf->{tag}) ) }
- my $new_id = $conf->{'renumber-from'} + $count - 1;
- my $new_id_field = MARC::Field->new( $conf->{tag},
- ' ',
- ' ',
- $conf->{subfield} => $new_id );
- $record->append_fields($new_id_field);
- print RENUMBER $record->as_xml;
- print STDERR "\rLast record: $count";
- }
- print STDERR "\rProcessed $count records. Last record id at ",
- ($conf->{'renumber-from'} + $count - 1), "\n";
-}
-
-
-=head2 initialize
-
-Performs boring script initialization. Handles argument parsing,
-mostly.
-
-=cut
-
-sub initialize {
- my ($c) = @_;
- my @missing = ();
-
- # set mode on existing filehandles
- binmode(STDIN, ':utf8');
-
- my $rc = GetOptions( $c,
- 'incoming',
- 'incumbent',
- 'output|o=s',
- 'renumber-from|rf=i',
- 'subfield|s=s',
- 'tag|t=s',
- 'help|h',
- );
- show_help() unless $rc;
- show_help() if ($c->{help});
-
- # set defaults if told to do so
- if ($c->{incoming}) {
- $c->{tag} = 903 unless defined $c->{tag};
- $c->{subfield} = 'a' unless defined $c->{subfield};
- $c->{output} = 'incoming.renumbered.marc.xml'
- unless defined $c->{output};
- } elsif ($c->{incumbent}) {
- $c->{tag} = 901 unless defined $c->{tag};
- $c->{subfield} = 'c' unless defined $c->{subfield};
- $c->{output} = 'incumbent.renumbered.marc.xml'
- unless defined $c->{output};
- }
-
- my @keys = keys %{$c};
- show_help() unless (@ARGV and @keys);
- for my $key ('renumber-from', 'tag', 'subfield', 'output')
- { push @missing, $key unless $c->{$key} }
- if (@missing) {
- print "Required option: ", join(', ', @missing), " missing!\n";
- show_help();
- }
-
-}
-
-
-=head2 show_help
-
-Display usage message when things go wrong
-
-=cut
-
-sub show_help {
-print <<HELP;
-Usage is: $0 [REQUIRED ARGS] <filelist>
-Req'd Arguments
- --renumber-from=N -rf First id# of new sequence
- --tag=N -t Which tag to use
- --subfield=X -s Which subfield to use
- --output=<file> -o Output filename
-Options
- --incoming Set -t, -s, -o to incoming defaults
- --incumbent Set -t, -s, -o to incumbent defaults
-
- Example: '$0 --incoming' is equivalent to
- '$0 -t 903 -s a -o incoming.renumbered.marc.xml'
-
-Any number of input files may be specified; one output file will result.
-HELP
-exit 1;
-}