From 6424b12b22cfd4bc06562f26b1b0498801a4f976 Mon Sep 17 00:00:00 2001 From: Shawn Boyette Date: Fri, 19 Dec 2008 16:53:45 +0000 Subject: [PATCH] more cleanup and organization --- filter_record.ids | 122 ------------------------------ ils-specific/parse-tlc-items-out.pl | 48 ++++++++++++ match_fingerprints.pl | 139 ----------------------------------- oldstyle/filter_record.ids | 122 ++++++++++++++++++++++++++++++ oldstyle/match_fingerprints.pl | 139 +++++++++++++++++++++++++++++++++++ oldstyle/renumber_marc | 125 +++++++++++++++++++++++++++++++ parse-tlc-items-out.pl | 48 ------------ renumber_marc | 125 ------------------------------- 8 files changed, 434 insertions(+), 434 deletions(-) delete mode 100644 filter_record.ids create mode 100644 ils-specific/parse-tlc-items-out.pl delete mode 100755 match_fingerprints.pl create mode 100644 oldstyle/filter_record.ids create mode 100755 oldstyle/match_fingerprints.pl create mode 100755 oldstyle/renumber_marc delete mode 100644 parse-tlc-items-out.pl delete mode 100755 renumber_marc diff --git a/filter_record.ids b/filter_record.ids deleted file mode 100644 index 46aa622..0000000 --- a/filter_record.ids +++ /dev/null @@ -1,122 +0,0 @@ -#!/usr/bin/perl -use warnings; -use strict; - -use Getopt::Long; - -# configuration hashref -my $conf = {}; -initialize($conf); - -my %id; - -open F, "<", $conf->{idfile}; -while () { - chomp; - $id{$_} = 1; -} -close F; - -my $M; my $I; my $S; -open $M, '<:utf8', $conf->{marcfile} - or die "Can't open marcfile '",$conf->{marcfile},"'\n"; -open $I, '>:utf8', $conf->{'output-import'} - or die "Can't open import file '",$conf->{'output-import'},"'\n"; -open $S, '>:utf8', $conf->{'output-shelved'} - or die "Can't open shelf file '",$conf->{'output-shelve'},"'\n"; - -while (<$M>) { - my $tag = $conf->{tag}; - my $sub = $conf->{subfield}; - - /tag="$tag" ind1=" " ind2=" ">.*?(\d+){mode} eq "exclude") { - print $S $_ if ($id{$1}); - print $I $_ unless ($id{$1});; - } else { - print $S $_ unless ($id{$1}); - print $I $_ if ($id{$1});; - } - $conf->{count}++; - - unless ($conf->{count} % 100) { - print STDERR "\rProcessed: ",$conf->{count}; - } -} - -=head2 initialize - -Performs boring script initialization. Handles argument parsing, -mostly. - -=cut - -sub initialize { - my ($c) = @_; - my @missing = (); - - # set mode on existing filehandles - binmode(STDIN, ':utf8'); - - my $rc = GetOptions( $c, - 'mode=s', - 'include', - 'tag|t=i', - 'subfield|s=s', - 'idfile|i=s', - 'marcfile|m=s', - 'output-import|oi=s', - 'output-shelved|os=s', - 'help|h', - ); - show_help() unless $rc; - show_help() if ($c->{help}); - - my @keys = keys %{$c}; - for my $key ('mode', 'idfile', 'marcfile', 'tag', 'subfield', - 'output-import', 'output-shelved') - { push @missing, $key unless $c->{$key} } - if (@missing) { - print "Required option: ", join(', ', @missing), " missing!\n"; - show_help(); - } - unless ($c->{mode} eq "include" or $c->{mode} eq "exclude") { - print "Unknown run mode '", $c->{mode}, "'\n"; - show_help(); - } -} - - -=head2 show_help - -Display usage message when things go wrong - -=cut - -sub show_help { -print <) { + chomp; + my @fields = split /\t/; + + if ( (!$state || $state eq 'item' || $state eq 'none') && $fields[$f] eq 'Borrower ID') { + $state = 'borrower'; + next; + } + + if ($state eq 'borrower') { + $patron = $fields[$f]; + $state = 'none'; + next; + } + + if ($state eq 'none' && $fields[$b] eq 'Item ID') { + $state = 'item'; + next; + } + + if ($state eq 'item' && $fields[$b] =~ /^\d+$/o) { + $item = $fields[$b]; + if ($fields[$f] =~ /^(\d+)\/(\d+)\/(\d+)$/) { + $out = sprintf('%04d-%02d-%02d', 2000 + $3, $1, $2); + } + if ($fields[$e] =~ /^(\d+)\/(\d+)\/(\d+)$/) { + $due = sprintf('%04d-%02d-%02d', 2000 + $3, $1, $2); + } + ($price = $fields[$g]) =~ s/\*//go; + print join("\t", $patron, $item, $out, $due, $price) . "\n"; + } +} diff --git a/match_fingerprints.pl b/match_fingerprints.pl deleted file mode 100755 index 5bc66f7..0000000 --- a/match_fingerprints.pl +++ /dev/null @@ -1,139 +0,0 @@ -#!/usr/bin/perl - -my $dataset = $ARGV[0]; - -my $match_to = $ARGV[1]; -my $match_these = $ARGV[2]; -my $match_to_score = $ARGV[3]; -my $match_these_score = $ARGV[4]; - -print "match_to: $match_to match_these: $match_these\n"; - -my %pines; -my %incoming; -my %match; -my %candidate_match; -my %score; - - -# create HOL of incumbent{fingerprint}[ids] -open FILE, $match_to; -while (my $line = ) { - chomp $line; - my @fields = split(/\t/,$line); - my $id = shift @fields; - my $fp = join '^', @fields; - if (! defined $pines{ $fp }) { $pines{ $fp } = []; } - push @{ $pines{ $fp } }, $id; -} -close FILE; - -# do the same for incoming -open FILE, $match_these; -while (my $line = ) { - chomp $line; - my @fields = split(/\t/,$line); - my $id = shift @fields; - my $fp = join '^', @fields; - if (! defined $incoming{ $fp }) { $incoming{ $fp } = []; } - push @{ $incoming{ $fp } }, $id; -} -close FILE; - -# scoring file stuffs, which i have never used -foreach my $file ( $match_to_score, $match_from_score ) { - open FILE, $file; - while (my $line = ) { - chomp $line; - my @fields = split(/\|/,$line); - my $id = shift @fields; $id =~ s/\D//g; - my $holdings = shift @fields; $holdings =~ s/\D//g; - my $subtitle = shift @fields; $subtitle =~ s/^\s+//; $subtitle =~ s/\s+$//; - $score{ $id } = [ $holdings, $subtitle ]; - } - close FILE; -} - - -open RECORD_IDS, ">match.record_ids"; -foreach my $fp ( keys %incoming ) { - # for each incoming fingerprint, - if (defined $pines{ $fp }) { - # if there is a matching incumbent fingerprint - foreach my $id ( @{ $incoming{ $fp } } ) { - # print all incoming record ids - print RECORD_IDS "$id\n"; - if ( ! defined $candidate_match{ $id } ) - # and create a mapping of incoming ids to fingerprints - # (used for scoring) - { $candidate_match{ $id } = []; } - push @{ $candidate_match{ $id } }, $fp; - } - } -} -close RECORD_IDS; - - -# scoring section, which i have never used -foreach my $id ( keys %candidate_match ) { - my $subtitle; - # if score{id} exists set subtitle to the sc - if (defined $score{ $id }) - { $subtitle = $score{ $id }[1]; } - - my @fps = @{ $candidate_match{ $id } }; - my @candidate_pines = (); - - my $subtitle_matched = 0; - my $highest_holdings = 0; - my $best_pines_id; - - foreach my $fp ( @fps ) { - foreach my $pines_id ( @{ $pines{ $fp } } ) { - my $pines_subtitle; - if (defined $score{ $pines_id }) - { $pines_subtitle = $score{ $pines_id }[1]; } - my $pines_holdings; - if (defined $score{ $pines_id }) - { $pines_holdings = $score{ $pines_id }[0]; } - if ($pines_subtitle eq $subtitle) { - if (! $subtitle_matched) { - $subtitle_matched = 1; - $best_pines_id = $pines_id; - $highest_holdings = -1; - } - } else { - if ($subtitle_matched) { next; } - } - if ( $pines_holdings > $highest_holdings ) { - $highest_holdings = $pines_holdings; - $best_pines_id = $pines_id; - } - } - } - - # this will silently fail, as the filehandle has been closed, but strict - # is not enabled - print RECORD_IDS "$best_pines_id\n"; - - if (! defined $match{ $best_pines_id } ) - { $match{ $best_pines_id } = [ $best_pines_id ]; } - push @{ $match{ $best_pines_id } }, $id; -} - - - -open GROUPINGS, ">match.groupings"; -foreach my $k ( keys %match ) { - print GROUPINGS join("^", - "checking", - $dataset, - $match{ $k }[0], - join(",",@{ $match{ $k } }), - join(",",@{ $match{ $k } }) - ) . "\n"; - -} -close GROUPINGS; - - diff --git a/oldstyle/filter_record.ids b/oldstyle/filter_record.ids new file mode 100644 index 0000000..46aa622 --- /dev/null +++ b/oldstyle/filter_record.ids @@ -0,0 +1,122 @@ +#!/usr/bin/perl +use warnings; +use strict; + +use Getopt::Long; + +# configuration hashref +my $conf = {}; +initialize($conf); + +my %id; + +open F, "<", $conf->{idfile}; +while () { + chomp; + $id{$_} = 1; +} +close F; + +my $M; my $I; my $S; +open $M, '<:utf8', $conf->{marcfile} + or die "Can't open marcfile '",$conf->{marcfile},"'\n"; +open $I, '>:utf8', $conf->{'output-import'} + or die "Can't open import file '",$conf->{'output-import'},"'\n"; +open $S, '>:utf8', $conf->{'output-shelved'} + or die "Can't open shelf file '",$conf->{'output-shelve'},"'\n"; + +while (<$M>) { + my $tag = $conf->{tag}; + my $sub = $conf->{subfield}; + + /tag="$tag" ind1=" " ind2=" ">.*?(\d+){mode} eq "exclude") { + print $S $_ if ($id{$1}); + print $I $_ unless ($id{$1});; + } else { + print $S $_ unless ($id{$1}); + print $I $_ if ($id{$1});; + } + $conf->{count}++; + + unless ($conf->{count} % 100) { + print STDERR "\rProcessed: ",$conf->{count}; + } +} + +=head2 initialize + +Performs boring script initialization. Handles argument parsing, +mostly. + +=cut + +sub initialize { + my ($c) = @_; + my @missing = (); + + # set mode on existing filehandles + binmode(STDIN, ':utf8'); + + my $rc = GetOptions( $c, + 'mode=s', + 'include', + 'tag|t=i', + 'subfield|s=s', + 'idfile|i=s', + 'marcfile|m=s', + 'output-import|oi=s', + 'output-shelved|os=s', + 'help|h', + ); + show_help() unless $rc; + show_help() if ($c->{help}); + + my @keys = keys %{$c}; + for my $key ('mode', 'idfile', 'marcfile', 'tag', 'subfield', + 'output-import', 'output-shelved') + { push @missing, $key unless $c->{$key} } + if (@missing) { + print "Required option: ", join(', ', @missing), " missing!\n"; + show_help(); + } + unless ($c->{mode} eq "include" or $c->{mode} eq "exclude") { + print "Unknown run mode '", $c->{mode}, "'\n"; + show_help(); + } +} + + +=head2 show_help + +Display usage message when things go wrong + +=cut + +sub show_help { +print <) { + chomp $line; + my @fields = split(/\t/,$line); + my $id = shift @fields; + my $fp = join '^', @fields; + if (! defined $pines{ $fp }) { $pines{ $fp } = []; } + push @{ $pines{ $fp } }, $id; +} +close FILE; + +# do the same for incoming +open FILE, $match_these; +while (my $line = ) { + chomp $line; + my @fields = split(/\t/,$line); + my $id = shift @fields; + my $fp = join '^', @fields; + if (! defined $incoming{ $fp }) { $incoming{ $fp } = []; } + push @{ $incoming{ $fp } }, $id; +} +close FILE; + +# scoring file stuffs, which i have never used +foreach my $file ( $match_to_score, $match_from_score ) { + open FILE, $file; + while (my $line = ) { + chomp $line; + my @fields = split(/\|/,$line); + my $id = shift @fields; $id =~ s/\D//g; + my $holdings = shift @fields; $holdings =~ s/\D//g; + my $subtitle = shift @fields; $subtitle =~ s/^\s+//; $subtitle =~ s/\s+$//; + $score{ $id } = [ $holdings, $subtitle ]; + } + close FILE; +} + + +open RECORD_IDS, ">match.record_ids"; +foreach my $fp ( keys %incoming ) { + # for each incoming fingerprint, + if (defined $pines{ $fp }) { + # if there is a matching incumbent fingerprint + foreach my $id ( @{ $incoming{ $fp } } ) { + # print all incoming record ids + print RECORD_IDS "$id\n"; + if ( ! defined $candidate_match{ $id } ) + # and create a mapping of incoming ids to fingerprints + # (used for scoring) + { $candidate_match{ $id } = []; } + push @{ $candidate_match{ $id } }, $fp; + } + } +} +close RECORD_IDS; + + +# scoring section, which i have never used +foreach my $id ( keys %candidate_match ) { + my $subtitle; + # if score{id} exists set subtitle to the sc + if (defined $score{ $id }) + { $subtitle = $score{ $id }[1]; } + + my @fps = @{ $candidate_match{ $id } }; + my @candidate_pines = (); + + my $subtitle_matched = 0; + my $highest_holdings = 0; + my $best_pines_id; + + foreach my $fp ( @fps ) { + foreach my $pines_id ( @{ $pines{ $fp } } ) { + my $pines_subtitle; + if (defined $score{ $pines_id }) + { $pines_subtitle = $score{ $pines_id }[1]; } + my $pines_holdings; + if (defined $score{ $pines_id }) + { $pines_holdings = $score{ $pines_id }[0]; } + if ($pines_subtitle eq $subtitle) { + if (! $subtitle_matched) { + $subtitle_matched = 1; + $best_pines_id = $pines_id; + $highest_holdings = -1; + } + } else { + if ($subtitle_matched) { next; } + } + if ( $pines_holdings > $highest_holdings ) { + $highest_holdings = $pines_holdings; + $best_pines_id = $pines_id; + } + } + } + + # this will silently fail, as the filehandle has been closed, but strict + # is not enabled + print RECORD_IDS "$best_pines_id\n"; + + if (! defined $match{ $best_pines_id } ) + { $match{ $best_pines_id } = [ $best_pines_id ]; } + push @{ $match{ $best_pines_id } }, $id; +} + + + +open GROUPINGS, ">match.groupings"; +foreach my $k ( keys %match ) { + print GROUPINGS join("^", + "checking", + $dataset, + $match{ $k }[0], + join(",",@{ $match{ $k } }), + join(",",@{ $match{ $k } }) + ) . "\n"; + +} +close GROUPINGS; + + diff --git a/oldstyle/renumber_marc b/oldstyle/renumber_marc new file mode 100755 index 0000000..fe2bfa7 --- /dev/null +++ b/oldstyle/renumber_marc @@ -0,0 +1,125 @@ +#!/usr/bin/perl +use strict; +use warnings; + +use MARC::Batch; +use Getopt::Long; +#use MARC::Record; +use MARC::File::XML ( BinaryEncoding => 'utf-8' ); +#use MARC::Field; + +$| = 1; + +my $count = 0; +my $conf = {}; # configuration hashref +initialize($conf); + +binmode(STDIN, ':utf8'); + +open RENUMBER, '>', $conf->{output}; +binmode(RENUMBER, ':utf8'); + +foreach my $input ( @ARGV ) { + print STDERR "Processing $input, starting record id at ", + $conf->{'renumber-from'},"\n"; + + my $batch = MARC::Batch->new('XML', $input); + while ( my $record = $batch->next ) { + $count++; + my @warnings = $batch->warnings; + print STDERR "WARNINGS: Record $count : ", + join(":",@warnings), " : continuing...\n" + if ( @warnings ); + + while ($record->field($conf->{tag})) + { $record->delete_field( $record->field($conf->{tag}) ) } + my $new_id = $conf->{'renumber-from'} + $count - 1; + my $new_id_field = MARC::Field->new( $conf->{tag}, + ' ', + ' ', + $conf->{subfield} => $new_id ); + $record->append_fields($new_id_field); + print RENUMBER $record->as_xml; + print STDERR "\rLast record: $count"; + } + print STDERR "\rProcessed $count records. Last record id at ", + ($conf->{'renumber-from'} + $count - 1), "\n"; +} + + +=head2 initialize + +Performs boring script initialization. Handles argument parsing, +mostly. + +=cut + +sub initialize { + my ($c) = @_; + my @missing = (); + + # set mode on existing filehandles + binmode(STDIN, ':utf8'); + + my $rc = GetOptions( $c, + 'incoming', + 'incumbent', + 'output|o=s', + 'renumber-from|rf=i', + 'subfield|s=s', + 'tag|t=s', + 'help|h', + ); + show_help() unless $rc; + show_help() if ($c->{help}); + + # set defaults if told to do so + if ($c->{incoming}) { + $c->{tag} = 903 unless defined $c->{tag}; + $c->{subfield} = 'a' unless defined $c->{subfield}; + $c->{output} = 'incoming.renumbered.marc.xml' + unless defined $c->{output}; + } elsif ($c->{incumbent}) { + $c->{tag} = 901 unless defined $c->{tag}; + $c->{subfield} = 'c' unless defined $c->{subfield}; + $c->{output} = 'incumbent.renumbered.marc.xml' + unless defined $c->{output}; + } + + my @keys = keys %{$c}; + show_help() unless (@ARGV and @keys); + for my $key ('renumber-from', 'tag', 'subfield', 'output') + { push @missing, $key unless $c->{$key} } + if (@missing) { + print "Required option: ", join(', ', @missing), " missing!\n"; + show_help(); + } + +} + + +=head2 show_help + +Display usage message when things go wrong + +=cut + +sub show_help { +print < +Req'd Arguments + --renumber-from=N -rf First id# of new sequence + --tag=N -t Which tag to use + --subfield=X -s Which subfield to use + --output= -o Output filename +Options + --incoming Set -t, -s, -o to incoming defaults + --incumbent Set -t, -s, -o to incumbent defaults + + Example: '$0 --incoming' is equivalent to + '$0 -t 903 -s a -o incoming.renumbered.marc.xml' + +Any number of input files may be specified; one output file will result. +HELP +exit 1; +} diff --git a/parse-tlc-items-out.pl b/parse-tlc-items-out.pl deleted file mode 100644 index 2cee130..0000000 --- a/parse-tlc-items-out.pl +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/perl - -# This parses TCL items-out reports converted from excel to csv, turning them -# into a tab separated file. arg! - -my $state; -my $patron; -my $item; -my $out; -my $due; -my $price; - -my ($a,$b,$c,$d,$e,$f,$g) = (0,1,2,3,4,5,6,7); - -print "patron\titem\tout\tdue\tprice\n"; - -while (<>) { - chomp; - my @fields = split /\t/; - - if ( (!$state || $state eq 'item' || $state eq 'none') && $fields[$f] eq 'Borrower ID') { - $state = 'borrower'; - next; - } - - if ($state eq 'borrower') { - $patron = $fields[$f]; - $state = 'none'; - next; - } - - if ($state eq 'none' && $fields[$b] eq 'Item ID') { - $state = 'item'; - next; - } - - if ($state eq 'item' && $fields[$b] =~ /^\d+$/o) { - $item = $fields[$b]; - if ($fields[$f] =~ /^(\d+)\/(\d+)\/(\d+)$/) { - $out = sprintf('%04d-%02d-%02d', 2000 + $3, $1, $2); - } - if ($fields[$e] =~ /^(\d+)\/(\d+)\/(\d+)$/) { - $due = sprintf('%04d-%02d-%02d', 2000 + $3, $1, $2); - } - ($price = $fields[$g]) =~ s/\*//go; - print join("\t", $patron, $item, $out, $due, $price) . "\n"; - } -} diff --git a/renumber_marc b/renumber_marc deleted file mode 100755 index fe2bfa7..0000000 --- a/renumber_marc +++ /dev/null @@ -1,125 +0,0 @@ -#!/usr/bin/perl -use strict; -use warnings; - -use MARC::Batch; -use Getopt::Long; -#use MARC::Record; -use MARC::File::XML ( BinaryEncoding => 'utf-8' ); -#use MARC::Field; - -$| = 1; - -my $count = 0; -my $conf = {}; # configuration hashref -initialize($conf); - -binmode(STDIN, ':utf8'); - -open RENUMBER, '>', $conf->{output}; -binmode(RENUMBER, ':utf8'); - -foreach my $input ( @ARGV ) { - print STDERR "Processing $input, starting record id at ", - $conf->{'renumber-from'},"\n"; - - my $batch = MARC::Batch->new('XML', $input); - while ( my $record = $batch->next ) { - $count++; - my @warnings = $batch->warnings; - print STDERR "WARNINGS: Record $count : ", - join(":",@warnings), " : continuing...\n" - if ( @warnings ); - - while ($record->field($conf->{tag})) - { $record->delete_field( $record->field($conf->{tag}) ) } - my $new_id = $conf->{'renumber-from'} + $count - 1; - my $new_id_field = MARC::Field->new( $conf->{tag}, - ' ', - ' ', - $conf->{subfield} => $new_id ); - $record->append_fields($new_id_field); - print RENUMBER $record->as_xml; - print STDERR "\rLast record: $count"; - } - print STDERR "\rProcessed $count records. Last record id at ", - ($conf->{'renumber-from'} + $count - 1), "\n"; -} - - -=head2 initialize - -Performs boring script initialization. Handles argument parsing, -mostly. - -=cut - -sub initialize { - my ($c) = @_; - my @missing = (); - - # set mode on existing filehandles - binmode(STDIN, ':utf8'); - - my $rc = GetOptions( $c, - 'incoming', - 'incumbent', - 'output|o=s', - 'renumber-from|rf=i', - 'subfield|s=s', - 'tag|t=s', - 'help|h', - ); - show_help() unless $rc; - show_help() if ($c->{help}); - - # set defaults if told to do so - if ($c->{incoming}) { - $c->{tag} = 903 unless defined $c->{tag}; - $c->{subfield} = 'a' unless defined $c->{subfield}; - $c->{output} = 'incoming.renumbered.marc.xml' - unless defined $c->{output}; - } elsif ($c->{incumbent}) { - $c->{tag} = 901 unless defined $c->{tag}; - $c->{subfield} = 'c' unless defined $c->{subfield}; - $c->{output} = 'incumbent.renumbered.marc.xml' - unless defined $c->{output}; - } - - my @keys = keys %{$c}; - show_help() unless (@ARGV and @keys); - for my $key ('renumber-from', 'tag', 'subfield', 'output') - { push @missing, $key unless $c->{$key} } - if (@missing) { - print "Required option: ", join(', ', @missing), " missing!\n"; - show_help(); - } - -} - - -=head2 show_help - -Display usage message when things go wrong - -=cut - -sub show_help { -print < -Req'd Arguments - --renumber-from=N -rf First id# of new sequence - --tag=N -t Which tag to use - --subfield=X -s Which subfield to use - --output= -o Output filename -Options - --incoming Set -t, -s, -o to incoming defaults - --incumbent Set -t, -s, -o to incumbent defaults - - Example: '$0 --incoming' is equivalent to - '$0 -t 903 -s a -o incoming.renumbered.marc.xml' - -Any number of input files may be specified; one output file will result. -HELP -exit 1; -} -- 1.7.2.5