X-Git-Url: http://git.equinoxoli.org/?p=migration-tools.git;a=blobdiff_plain;f=mig-bin%2Fmig-bibstats;fp=mig-bin%2Fmig-bibstats;h=0000000000000000000000000000000000000000;hp=e0db26603727667e6e42a34c936d5dca997fd8ca;hb=155eb9eac077ca803f75d1295e584e7012e1b883;hpb=69588457ab8f70fbb77af29cc0653933d24ed2ac diff --git a/mig-bin/mig-bibstats b/mig-bin/mig-bibstats deleted file mode 100755 index e0db266..0000000 --- a/mig-bin/mig-bibstats +++ /dev/null @@ -1,206 +0,0 @@ -#!/usr/bin/perl -# -*- coding: iso-8859-15 -*- -############################################################################### -=pod - -=item B --file foo.mrc - -Reads through a marc file to generate statistical information about the file -for quick analysis. - ---uri_threshold defaults to 1, only shows URI values with more than that -frequency - ---ingore_filetype true will have it not care what file returns as the type and -always treat it as marc21 -=back - -=cut - -############################################################################### - -use strict; -use warnings; - -use Data::Dumper; -use Env qw( - HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA - MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR -); -use Pod::Usage; -use Switch; -use Getopt::Long; -use MARC::Batch; -use MARC::Record; -use MARC::Field; -use Cwd 'abs_path'; -use Cwd qw(getcwd); -use FindBin; -my $mig_bin = "$FindBin::Bin/"; -use lib "$FindBin::Bin/"; -use Mig; -use open ':encoding(utf8)'; - -pod2usage(-verbose => 2) if defined $ARGV[0] && $ARGV[0] eq '--help'; -pod2usage(-verbose => 1) if ! $ARGV[1]; - -my $file; -my $uri_threshold = 1; -my $p_holding_code; -my $p_barcode_subfield; -my $p_ils_name = 'Runtime ILS'; -my $holding_threshold = 50; -my $p_ignore_filetype = 'false'; - -my $ret = GetOptions( - 'file:s' => \$file, - 'uri_threshold:i' => \$uri_threshold, - 'holding_code:s' => \$p_holding_code, - 'barcode:s' => \$p_barcode_subfield, - 'ignore_filetype:s' => \$p_ignore_filetype, - 'ils_name:s' => \$p_ils_name, - 'holding_threshold:s' => \$holding_threshold -); - -if ($p_holding_code and length $p_holding_code != 3) { abort('Holdings codes must be three characters.'); } - -if ($p_barcode_subfield) { - if (!defined $p_holding_code) { abort('A barcode field can not be used without a holding code.'); } - if (length $p_barcode_subfield != 1) { abort('Barcode subfields must be a single character code.'); } -} - -my @ilses = ( - ['Mandarin','852','p'], - ['Evergreen','852','p'], - ['Polaris','852','p'], - ['TLC','949','g'], - ['Koha','952','p'], - ['Sympony','999','i'] -); - -my @temp; -if ($p_holding_code) { - push @temp, $p_ils_name; - push @temp, $p_holding_code; - if ($p_barcode_subfield) { push @temp, lc $p_barcode_subfield; } -} -push @ilses, @temp; - - - -my $batch = MARC::Batch->new('USMARC', $file); -$batch->strict_off(); -my $filetype = `file $file`; -if ($filetype =~ m/MARC21/ or $p_ignore_filetype eq 'true') { print "$filetype.\n" } - else { abort("File is not MARC21."); } - -my $i = 0; -my $uri_count = 0; -my $uri_valid_count = 0; -my $uri_sub9_count = 0; -my $author_sub0 = 0; -my $title_sub0 = 0; -my @uris; -my @fields; -my @codes; -my @holding_code_strings; -my %holding_counts; -my %barcode_counts; - -foreach (@ilses) { - $holding_counts{@$_[0]} = 0; - $barcode_counts{@$_[0]} = 0; -} - -while ( my $record = $batch->next() ) { - $i++; - #check holdings, bit time consuming but more future proof - foreach (@ilses) { - my $ils = @$_[0]; - my $hcode = @$_[1]; - my $barcode = @$_[2]; - my @holding_fields = $record->field($hcode); - my $l = scalar @holding_fields; - my $v = $holding_counts{$ils}; - if ($l) { $holding_counts{$ils} = $v + $l; } - } - #process 856s - @fields = $record->field('856'); - my $ldr = substr $record->leader(), 9, 1; - push @codes, $ldr; - foreach my $f (@fields) { - my $u = $f->subfield('u'); - my $n = $f->subfield('9'); - if (defined $n) { $uri_sub9_count++; } - if (defined $u) { - $uri_count++; - my $ind1 = $f->indicator('1'); - my $ind2 = $f->indicator('2'); - if ($ind1 eq '4') { - if ($ind2 eq '0' or $ind2 eq '1') { $uri_valid_count++; } - } - my $ustring = lc $f->as_string('u'); - $ustring =~ s/http:\/\///; - $ustring =~ s/ftp:\/\///; - $ustring =~ s/https:\/\///; - $ustring =~ s/\/.*//; - push @uris, $ustring; - } - } - #check for authority linking on 100s and 245s, if present may need to scrub them - @fields = $record->field('100'); - foreach my $f (@fields) { - my $t = $f->subfield('0'); - if (defined $t) { $title_sub0++; } - } - @fields = $record->field('245'); - foreach my $f (@fields) { - my $t = $f->subfield('0'); - if (defined $t) { $author_sub0++; } - } - if(($i % 1000) == 0) { print "Processing bib $i.\n"; } -} - -my %uri_counts; -$uri_counts{$_}++ for @uris; - -my %code_counts; -$code_counts{$_}++ for @codes; - -print "\n$filetype\n"; -print "$i bibs read in file\n\n"; - -print "=== Leader 09 codes\n"; -foreach my $key (keys %code_counts) { - my $value = $code_counts{$key}; - print "=== $key $value\n"; -} -print "\n"; - -print "$uri_count 856 fields with a subfield u\n"; -print "$uri_valid_count 856 fields with a subfield u and valid indicators\n"; -print "$uri_sub9_count 856 fields have subfield 9s\n"; -print "$title_sub0 100 fields have a subfield 0\n"; -print "$author_sub0 245 fields have a subfield 0\n"; - -print "\n=== Holdings Analysis\n"; -foreach my $key (keys %holding_counts) { - my $c = $holding_counts{$key}; - if (((100/$i)*$c) >= $holding_threshold) { print "Could be $key $holding_counts{$key} holdings tags\n"; } -} - -print "\nURI values are domains and filtered to only show those with more than $uri_threshold\n"; -foreach my $key (keys %uri_counts) { - my $value = $uri_counts{$key}; - if ($value > $uri_threshold) { print "=== $key $value\n"; } -} - -close $file; - -########### functions - -sub abort { - my $msg = shift; - print STDERR "$0: $msg", "\n"; - exit 1; -}