From f8b80bf0cf74b30b0440791b01c5dcc4448f13b2 Mon Sep 17 00:00:00 2001 From: Shawn Boyette Date: Tue, 17 Mar 2009 16:56:33 +0000 Subject: [PATCH] moving stuff, again --- oldstyle/filter_record.ids | 122 ----------------------------------- oldstyle/match_fingerprints.pl | 139 ---------------------------------------- oldstyle/renumber_marc | 125 ------------------------------------ 3 files changed, 0 insertions(+), 386 deletions(-) delete mode 100644 oldstyle/filter_record.ids delete mode 100755 oldstyle/match_fingerprints.pl delete mode 100755 oldstyle/renumber_marc diff --git a/oldstyle/filter_record.ids b/oldstyle/filter_record.ids deleted file mode 100644 index 46aa622..0000000 --- a/oldstyle/filter_record.ids +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/perl -use warnings; -use strict; - -use Getopt::Long; - -# configuration hashref -my $conf = {}; -initialize($conf); - -my %id; - -open F, "<", $conf->{idfile}; -while () { - chomp; - $id{$_} = 1; -} -close F; - -my $M; my $I; my $S; -open $M, '<:utf8', $conf->{marcfile} - or die "Can't open marcfile '",$conf->{marcfile},"'\n"; -open $I, '>:utf8', $conf->{'output-import'} - or die "Can't open import file '",$conf->{'output-import'},"'\n"; -open $S, '>:utf8', $conf->{'output-shelved'} - or die "Can't open shelf file '",$conf->{'output-shelve'},"'\n"; - -while (<$M>) { - my $tag = $conf->{tag}; - my $sub = $conf->{subfield}; - - /tag="$tag" ind1=" " ind2=" ">.*?(\d+){mode} eq "exclude") { - print $S $_ if ($id{$1}); - print $I $_ unless ($id{$1});; - } else { - print $S $_ unless ($id{$1}); - print $I $_ if ($id{$1});; - } - $conf->{count}++; - - unless ($conf->{count} % 100) { - print STDERR "\rProcessed: ",$conf->{count}; - } -} - -=head2 initialize - -Performs boring script initialization. Handles argument parsing, -mostly. - -=cut - -sub initialize { - my ($c) = @_; - my @missing = (); - - # set mode on existing filehandles - binmode(STDIN, ':utf8'); - - my $rc = GetOptions( $c, - 'mode=s', - 'include', - 'tag|t=i', - 'subfield|s=s', - 'idfile|i=s', - 'marcfile|m=s', - 'output-import|oi=s', - 'output-shelved|os=s', - 'help|h', - ); - show_help() unless $rc; - show_help() if ($c->{help}); - - my @keys = keys %{$c}; - for my $key ('mode', 'idfile', 'marcfile', 'tag', 'subfield', - 'output-import', 'output-shelved') - { push @missing, $key unless $c->{$key} } - if (@missing) { - print "Required option: ", join(', ', @missing), " missing!\n"; - show_help(); - } - unless ($c->{mode} eq "include" or $c->{mode} eq "exclude") { - print "Unknown run mode '", $c->{mode}, "'\n"; - show_help(); - } -} - - -=head2 show_help - -Display usage message when things go wrong - -=cut - -sub show_help { -print <) { - chomp $line; - my @fields = split(/\t/,$line); - my $id = shift @fields; - my $fp = join '^', @fields; - if (! defined $pines{ $fp }) { $pines{ $fp } = []; } - push @{ $pines{ $fp } }, $id; -} -close FILE; - -# do the same for incoming -open FILE, $match_these; -while (my $line = ) { - chomp $line; - my @fields = split(/\t/,$line); - my $id = shift @fields; - my $fp = join '^', @fields; - if (! defined $incoming{ $fp }) { $incoming{ $fp } = []; } - push @{ $incoming{ $fp } }, $id; -} -close FILE; - -# scoring file stuffs, which i have never used -foreach my $file ( $match_to_score, $match_from_score ) { - open FILE, $file; - while (my $line = ) { - chomp $line; - my @fields = split(/\|/,$line); - my $id = shift @fields; $id =~ s/\D//g; - my $holdings = shift @fields; $holdings =~ s/\D//g; - my $subtitle = shift @fields; $subtitle =~ s/^\s+//; $subtitle =~ s/\s+$//; - $score{ $id } = [ $holdings, $subtitle ]; - } - close FILE; -} - - -open RECORD_IDS, ">match.record_ids"; -foreach my $fp ( keys %incoming ) { - # for each incoming fingerprint, - if (defined $pines{ $fp }) { - # if there is a matching incumbent fingerprint - foreach my $id ( @{ $incoming{ $fp } } ) { - # print all incoming record ids - print RECORD_IDS "$id\n"; - if ( ! defined $candidate_match{ $id } ) - # and create a mapping of incoming ids to fingerprints - # (used for scoring) - { $candidate_match{ $id } = []; } - push @{ $candidate_match{ $id } }, $fp; - } - } -} -close RECORD_IDS; - - -# scoring section, which i have never used -foreach my $id ( keys %candidate_match ) { - my $subtitle; - # if score{id} exists set subtitle to the sc - if (defined $score{ $id }) - { $subtitle = $score{ $id }[1]; } - - my @fps = @{ $candidate_match{ $id } }; - my @candidate_pines = (); - - my $subtitle_matched = 0; - my $highest_holdings = 0; - my $best_pines_id; - - foreach my $fp ( @fps ) { - foreach my $pines_id ( @{ $pines{ $fp } } ) { - my $pines_subtitle; - if (defined $score{ $pines_id }) - { $pines_subtitle = $score{ $pines_id }[1]; } - my $pines_holdings; - if (defined $score{ $pines_id }) - { $pines_holdings = $score{ $pines_id }[0]; } - if ($pines_subtitle eq $subtitle) { - if (! $subtitle_matched) { - $subtitle_matched = 1; - $best_pines_id = $pines_id; - $highest_holdings = -1; - } - } else { - if ($subtitle_matched) { next; } - } - if ( $pines_holdings > $highest_holdings ) { - $highest_holdings = $pines_holdings; - $best_pines_id = $pines_id; - } - } - } - - # this will silently fail, as the filehandle has been closed, but strict - # is not enabled - print RECORD_IDS "$best_pines_id\n"; - - if (! defined $match{ $best_pines_id } ) - { $match{ $best_pines_id } = [ $best_pines_id ]; } - push @{ $match{ $best_pines_id } }, $id; -} - - - -open GROUPINGS, ">match.groupings"; -foreach my $k ( keys %match ) { - print GROUPINGS join("^", - "checking", - $dataset, - $match{ $k }[0], - join(",",@{ $match{ $k } }), - join(",",@{ $match{ $k } }) - ) . "\n"; - -} -close GROUPINGS; - - diff --git a/oldstyle/renumber_marc b/oldstyle/renumber_marc deleted file mode 100755 index fe2bfa7..0000000 --- a/oldstyle/renumber_marc +++ /dev/null @@ -1,125 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; - -use MARC::Batch; -use Getopt::Long; -#use MARC::Record; -use MARC::File::XML ( BinaryEncoding => 'utf-8' ); -#use MARC::Field; - -$| = 1; - -my $count = 0; -my $conf = {}; # configuration hashref -initialize($conf); - -binmode(STDIN, ':utf8'); - -open RENUMBER, '>', $conf->{output}; -binmode(RENUMBER, ':utf8'); - -foreach my $input ( @ARGV ) { - print STDERR "Processing $input, starting record id at ", - $conf->{'renumber-from'},"\n"; - - my $batch = MARC::Batch->new('XML', $input); - while ( my $record = $batch->next ) { - $count++; - my @warnings = $batch->warnings; - print STDERR "WARNINGS: Record $count : ", - join(":",@warnings), " : continuing...\n" - if ( @warnings ); - - while ($record->field($conf->{tag})) - { $record->delete_field( $record->field($conf->{tag}) ) } - my $new_id = $conf->{'renumber-from'} + $count - 1; - my $new_id_field = MARC::Field->new( $conf->{tag}, - ' ', - ' ', - $conf->{subfield} => $new_id ); - $record->append_fields($new_id_field); - print RENUMBER $record->as_xml; - print STDERR "\rLast record: $count"; - } - print STDERR "\rProcessed $count records. Last record id at ", - ($conf->{'renumber-from'} + $count - 1), "\n"; -} - - -=head2 initialize - -Performs boring script initialization. Handles argument parsing, -mostly. - -=cut - -sub initialize { - my ($c) = @_; - my @missing = (); - - # set mode on existing filehandles - binmode(STDIN, ':utf8'); - - my $rc = GetOptions( $c, - 'incoming', - 'incumbent', - 'output|o=s', - 'renumber-from|rf=i', - 'subfield|s=s', - 'tag|t=s', - 'help|h', - ); - show_help() unless $rc; - show_help() if ($c->{help}); - - # set defaults if told to do so - if ($c->{incoming}) { - $c->{tag} = 903 unless defined $c->{tag}; - $c->{subfield} = 'a' unless defined $c->{subfield}; - $c->{output} = 'incoming.renumbered.marc.xml' - unless defined $c->{output}; - } elsif ($c->{incumbent}) { - $c->{tag} = 901 unless defined $c->{tag}; - $c->{subfield} = 'c' unless defined $c->{subfield}; - $c->{output} = 'incumbent.renumbered.marc.xml' - unless defined $c->{output}; - } - - my @keys = keys %{$c}; - show_help() unless (@ARGV and @keys); - for my $key ('renumber-from', 'tag', 'subfield', 'output') - { push @missing, $key unless $c->{$key} } - if (@missing) { - print "Required option: ", join(', ', @missing), " missing!\n"; - show_help(); - } - -} - - -=head2 show_help - -Display usage message when things go wrong - -=cut - -sub show_help { -print < -Req'd Arguments - --renumber-from=N -rf First id# of new sequence - --tag=N -t Which tag to use - --subfield=X -s Which subfield to use - --output= -o Output filename -Options - --incoming Set -t, -s, -o to incoming defaults - --incumbent Set -t, -s, -o to incumbent defaults - - Example: '$0 --incoming' is equivalent to - '$0 -t 903 -s a -o incoming.renumbered.marc.xml' - -Any number of input files may be specified; one output file will result. -HELP -exit 1; -} -- 1.7.2.5