From 500acbdf4f526f07aa147b854409fc3ed272d23b Mon Sep 17 00:00:00 2001 From: Rogan Hamby Date: Mon, 13 Jul 2020 10:48:22 -0400 Subject: [PATCH] make a message more meaningful --- kmig.d/bin/mig-bibstats | 295 +++++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 295 insertions(+), 0 deletions(-) create mode 100755 kmig.d/bin/mig-bibstats diff --git a/kmig.d/bin/mig-bibstats b/kmig.d/bin/mig-bibstats new file mode 100755 index 0000000..c115dd6 --- /dev/null +++ b/kmig.d/bin/mig-bibstats @@ -0,0 +1,295 @@ +#!/usr/bin/perl +# -*- coding: iso-8859-15 -*- +############################################################################### +=pod + +=item B --file foo.mrc + +Reads through a marc file to generate statistical information about the file +for quick analysis. + +--uri_threshold defaults to 1, only shows URI values with more than that +frequency + +--ignore_filetype true will have it not care what file returns as the type and +always treat it as marc21 + +--ils --holding_code --barcode_subfield work together to pass an new ILS +definnition without it being hardcode in the script and can test arbitary +fields + +--exportbarcodes ils_name is used if you want to export the barcodes associated +with one of the ILSes so provide the name + +--exportbarcodesfile will use this file name for a barcode export instead +of the generic 'barcodes_export.txt' + +=back +=cut + +############################################################################### + +use strict; +use warnings; + +use Data::Dumper; +use Env qw( + HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA + MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR +); +use Pod::Usage; +use Switch; +use Getopt::Long; +use MARC::Batch; +use MARC::Record; +use MARC::Field; +use Cwd 'abs_path'; +use Cwd qw(getcwd); +use List::MoreUtils qw(uniq); +use FindBin; +my $mig_bin = "$FindBin::Bin/"; +use lib "$FindBin::Bin/"; +#use EMig; +use KMig; +use open ':encoding(utf8)'; + +pod2usage(-verbose => 2) if defined $ARGV[0] && $ARGV[0] eq '--help'; +pod2usage(-verbose => 1) if ! $ARGV[1]; + +my $file; +my $uri_threshold = 1; +my $p_holding_code; +my $p_barcode_subfield; +my $p_ils_name = ''; +my $holding_threshold = 50; +my $p_ignore_filetype = 'false'; +my @holdings; +my %unique_barcodes; +my $exportbarcodes; +my $exportbarcodesfile; + +my $ret = GetOptions( + 'file:s' => \$file, + 'uri_threshold:i' => \$uri_threshold, + 'holding_code:s' => \$p_holding_code, + 'barcode_subfield:s' => \$p_barcode_subfield, + 'ignore_filetype:s' => \$p_ignore_filetype, + 'ils:s' => \$p_ils_name, + 'exportbarcodes:s' => \$exportbarcodes, + 'exportbarcodesfile:s' => \$exportbarcodesfile, + 'holding_threshold:s' => \$holding_threshold +); + +if ($exportbarcodesfile and !defined $exportbarcodes) { abort('You have to provide an ILS name if you want a barcode export file.'); } + +if ($p_holding_code and length $p_holding_code != 3) { abort('Holdings codes must be three characters.'); } + +if ($p_barcode_subfield) { + if (!defined $p_holding_code) { abort('A barcode field can not be used without a holding code.'); } + if (length $p_barcode_subfield != 1) { abort('Barcode subfields must be a single character code.'); } +} + +# ils name, holding tag, barcode subfield +my @ilses = ( + ['Mandarin','852','p'], + ['Evergreen','852','p'], + ['Polaris','852','p'], + ['TLC','949','g'], + ['Koha','952','p'], + ['Sympony','999','i'], + ['Destiny','852','p'] +); + +my @temp; +if ($p_holding_code) { + push @temp, $p_ils_name; + push @temp, $p_holding_code; + if ($p_barcode_subfield) { push @temp, lc $p_barcode_subfield; } + push @ilses, [@temp]; +} + +#to do - add a check for exportbarcodes being in @ilses + +my $batch = MARC::Batch->new('USMARC', $file); +$batch->strict_off(); +my $filetype = `file $file`; +if ($filetype =~ m/MARC21/ or $p_ignore_filetype eq 'true') { print "$filetype.\n" } + else { abort("File is not MARC21."); } + +my $i = 0; +my $uri_count = 0; +my $uri_valid_count = 0; +my $uri_sub9_count = 0; +my $author_sub0 = 0; +my $title_sub0 = 0; +my @uris; +my @fields; +my @encodings; +my @types; +my @holding_code_strings; +my %holding_counts; +my %barcode_counts; + +foreach (@ilses) { + $holding_counts{@$_[0]} = 0; + $barcode_counts{@$_[0]} = 0; +} + +while ( my $record = $batch->next() ) { + $i++; + #check holdings, bit time consuming but more future proof + foreach (@ilses) { + my $ils = @$_[0]; + my $hcode = @$_[1]; + my $barcode = @$_[2]; + my @holding_fields = $record->field($hcode); + foreach my $hf (@holding_fields) { + my @h; + my $barcode_string = $hf->subfield($barcode); + push @h, $ils; + push @h, $barcode_string; + push @holdings, [@h]; + } + my $l = scalar @holding_fields; + my $v = $holding_counts{$ils}; + if ($l) { $holding_counts{$ils} = $v + $l; } + } + #process 856s + @fields = $record->field('856'); + my $enc = substr $record->leader(), 9, 1; + push @encodings, $enc; + my $type = substr $record->leader(), 6, 1; + push @types, $type; + foreach my $f (@fields) { + my $u = $f->subfield('u'); + my $n = $f->subfield('9'); + if (defined $n) { $uri_sub9_count++; } + if (defined $u) { + $uri_count++; + my $ind1 = $f->indicator('1'); + my $ind2 = $f->indicator('2'); + if ($ind1 eq '4') { + if ($ind2 eq '0' or $ind2 eq '1') { $uri_valid_count++; } + } + my $ustring = lc $f->as_string('u'); + $ustring =~ s/http:\/\///; + $ustring =~ s/ftp:\/\///; + $ustring =~ s/https:\/\///; + $ustring =~ s/\/.*//; + push @uris, $ustring; + } + } + #check for authority linking on 100s and 245s, if present may need to scrub them + @fields = $record->field('100'); + foreach my $f (@fields) { + my $t = $f->subfield('0'); + if (defined $t) { $title_sub0++; } + } + @fields = $record->field('245'); + foreach my $f (@fields) { + my $t = $f->subfield('0'); + if (defined $t) { $author_sub0++; } + } + if(($i % 1000) == 0) { print "Processing bib $i.\n"; } +} + +foreach (@ilses) { + my $ils = @$_[0]; + my @temp_barcodes; + foreach my $h (@holdings) { + my $temp_ils_name = @$h[0]; + if ($temp_ils_name eq $ils) { push @temp_barcodes, @$h[1]; } + } + my @uniq_barcodes = uniq @temp_barcodes;; + $barcode_counts{$ils} = scalar @uniq_barcodes; +} + +my %uri_counts; +$uri_counts{$_}++ for @uris; + +my %encoding_counts; +$encoding_counts{$_}++ for @encodings; + +my %type_counts; +$type_counts{$_}++ for @types; + +print "\n$filetype\n"; +print "$i bibs read in file\n\n"; + +print "===== Leader 09, # = MARC-8, a = UCS/Unicode\n"; +foreach my $key (keys %encoding_counts) { + my $value = $encoding_counts{$key}; + print " $key $value\n"; +} +print "\n"; + +print "===== Leader 06\n"; +foreach my $key (keys %type_counts) { + my $value = $type_counts{$key}; + my $type = give_type($key); + print " $key $value $type\n"; +} +print "\n"; + +print "===== Summary of Select Field Counts\n"; +print " $uri_count 856 fields with a subfield u\n"; +print " $uri_valid_count 856 fields with a subfield u and valid indicators\n"; +print " $uri_sub9_count 856 fields have a subfield 9\n"; +print " $title_sub0 100 fields have a subfield 0\n"; +print " $author_sub0 245 fields have a subfield 0\n"; + +print "\n===== Holdings Analysis\n"; +foreach my $key (keys %holding_counts) { + my $c = $holding_counts{$key}; + if (((100/$i)*$c) >= $holding_threshold) { print " $key $holding_counts{$key} holdings in $i bibs with $barcode_counts{$key} unique barcodes\n"; } +} + +print "\n===== URI values are domains and filtered to only show those with more than $uri_threshold\n"; +foreach my $key (keys %uri_counts) { + my $value = $uri_counts{$key}; + if ($value > $uri_threshold) { print " $key $value\n"; } +} + +if ($exportbarcodes) { + my @temp_barcodes; + my $outfile; + if ($exportbarcodesfile) { $outfile = $exportbarcodesfile; } else { $outfile = 'barcodes_export.txt'; } + open my $out_fh, '>:utf8', $outfile or abort('can not open output file for barcode list'); + foreach my $h (@holdings) { + my $temp_ils_name = @$h[0]; + my $barcode = @$h[1]; + if (!defined $barcode) { $barcode = 'no barcode found'; } + if ($temp_ils_name eq $exportbarcodes) { print $out_fh "@$h[1]\n" } + } + close $out_fh; +} else { print "No barcodes being exported.\n"; } + +close $file; + +########### functions + +sub abort { + my $msg = shift; + print STDERR "$0: $msg", "\n"; + exit 1; +} + +sub give_type { + my $type = shift; + if ($type eq 'a') { return 'Language material'; } + if ($type eq 'c') { return 'Notated Music'; } + if ($type eq 'd') { return 'Manuscript notated music'; } + if ($type eq 'e') { return 'Cartographic material'; } + if ($type eq 'f') { return 'Manuscript cartographic material'; } + if ($type eq 'g') { return 'Projected Medium'; } + if ($type eq 'i') { return 'Nonmusical sound recording'; } + if ($type eq 'j') { return 'Musical sound recording'; } + if ($type eq 'k') { return 'Two-dimensional nonprojectable graphic'; } + if ($type eq 'm') { return 'Computer file'; } + if ($type eq 'o') { return 'Kit'; } + if ($type eq 'p') { return 'Mixed materials'; } + if ($type eq 'r') { return 'Three-dimensaional artifact or naturally occurring object'; } + if ($type eq 't') { return 'Manuscript language material'; } + if ($type eq 'z') { return 'Authority'; } + return 'unknown'; +} -- 1.7.2.5