#!/usr/bin/perl # -*- coding: iso-8859-15 -*- ############################################################################### =pod =item B --file foo.mrc Reads through a marc file to generate statistical information about the file for quick analysis. --uri_threshold defaults to 1, only shows URI values with more than that frequency --ignore_filetype true will have it not care what file returns as the type and always treat it as marc21 --ils --holding_code --barcode_subfield work together to pass an new ILS definnition without it being hardcode in the script and can test arbitary fields --exportbarcodes ils_name is used if you want to export the barcodes associated with one of the ILSes so provide the name --exportbarcodesfile will use this file name for a barcode export instead of the generic 'barcodes_export.txt' --item_type_subfield will make use of --holding_code and provide a breakdown of bib types by item types. If --branch_subfield is also provided, then the breakdown will be further subdivided by branch. =back =cut ############################################################################### use strict; use warnings; use Data::Dumper; use Env qw( HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR ); use Pod::Usage; use Switch; use Getopt::Long; use MARC::Batch; use MARC::Record; use MARC::Field; use Cwd 'abs_path'; use Cwd qw(getcwd); use List::MoreUtils qw(uniq); use FindBin; my $mig_bin = "$FindBin::Bin/"; use lib "$FindBin::Bin/"; use EMig; #use KMig; use open ':encoding(utf8)'; pod2usage(-verbose => 2) if defined $ARGV[0] && $ARGV[0] eq '--help'; pod2usage(-verbose => 1) if ! $ARGV[1]; my $file; my $uri_threshold = 1; my $p_holding_code; my $p_barcode_subfield; my $p_item_type_subfield; my $p_branch_subfield; my $p_ils_name = ''; my $holding_threshold = 50; my $p_ignore_filetype = 'false'; my @holdings; my %unique_barcodes; my $exportbarcodes; my $exportbarcodesfile; my $ret = GetOptions( 'file:s' => \$file, 'uri_threshold:i' => \$uri_threshold, 'holding_code:s' => \$p_holding_code, 'barcode_subfield:s' => \$p_barcode_subfield, 'item_type_subfield:s' => \$p_item_type_subfield, 'branch_subfield:s' => \$p_branch_subfield, 'ignore_filetype:s' => \$p_ignore_filetype, 'ils:s' => \$p_ils_name, 'exportbarcodes:s' => \$exportbarcodes, 'exportbarcodesfile:s' => \$exportbarcodesfile, 'holding_threshold:s' => \$holding_threshold ); if ($exportbarcodesfile and !defined $exportbarcodes) { abort('You have to provide an ILS name if you want a barcode export file.'); } if ($p_holding_code and length $p_holding_code != 3) { abort('Holdings codes must be three characters.'); } if ($p_barcode_subfield) { if (!defined $p_holding_code) { abort('A barcode field can not be used without a holding code.'); } if (length $p_barcode_subfield != 1) { abort('Barcode subfields must be a single character code.'); } } if ($p_item_type_subfield) { if (!defined $p_holding_code) { abort('An item type field can not be used without a holding code.'); } if (length $p_item_type_subfield != 1) { abort('Item type subfields must be a single character code.'); } } if ($p_branch_subfield) { if (!defined $p_holding_code) { abort('A branch field can not be used without a holding code.'); } if (length $p_branch_subfield != 1) { abort('Branch subfields must be a single character code.'); } } # ils name, holding tag, barcode subfield my @ilses = ( ['Mandarin','852','p'], ['Evergreen','852','p'], ['Polaris','852','p'], ['TLC','949','g'], ['Koha','952','p'], ['Sympony','999','i'], ['Destiny','852','p'] ); my @temp; if (defined $p_holding_code && defined $p_ils_name && defined $p_barcode_subfield) { push @temp, $p_ils_name; push @temp, $p_holding_code; if ($p_barcode_subfield) { push @temp, lc $p_barcode_subfield; } push @ilses, [@temp]; } #to do - add a check for exportbarcodes being in @ilses my $filetype = `file $file`; my $batch; if ($filetype =~ m/MARC21/) { $batch = MARC::Batch->new( 'USMARC', $file ); } else { $batch = MARC::Batch->new( 'XML', $file ); } $batch->strict_off(); my $i = 0; my $uri_count = 0; my $uri_valid_count = 0; my $uri_sub9_count = 0; my $author_sub0 = 0; my $title_sub0 = 0; my @uris; my @fields; my @encodings; my @types; my %bib_types_by_item_type; my @holding_code_strings; my %holding_counts; my %barcode_counts; foreach (@ilses) { $holding_counts{@$_[0]} = 0; $barcode_counts{@$_[0]} = 0; } while ( my $record = $batch->next() ) { $i++; #check holdings, bit time consuming but more future proof foreach (@ilses) { my $ils = @$_[0]; my $hcode = @$_[1]; my $barcode = @$_[2]; my @holding_fields = $record->field($hcode); foreach my $hf (@holding_fields) { my @h; my $barcode_string = $hf->subfield($barcode); push @h, $ils; push @h, $barcode_string; push @holdings, [@h]; } my $l = scalar @holding_fields; my $v = $holding_counts{$ils}; if ($l) { $holding_counts{$ils} = $v + $l; } } #process 856s @fields = $record->field('856'); my $enc = substr $record->leader(), 9, 1; push @encodings, $enc; my $type = substr $record->leader(), 6, 1; push @types, $type; # bib type by branch and by item type if item subfield (and optionally branch subfield) provided if (defined $p_holding_code && defined $p_item_type_subfield) { my @holding_fields = $record->field($p_holding_code); foreach my $hf (@holding_fields) { my $item_type = $hf->subfield($p_item_type_subfield) || ''; my $branch = $p_branch_subfield ? $hf->subfield($p_branch_subfield) : 'default'; if (! defined $bib_types_by_item_type{ $branch }) { $bib_types_by_item_type{ $branch } = {}; } if (! defined $bib_types_by_item_type{ $branch }{ $type }) { $bib_types_by_item_type{ $branch }{ $type } = {}; } if (! defined $bib_types_by_item_type{ $branch }{ $type }{ $item_type }) { $bib_types_by_item_type{ $branch }{ $type }{ $item_type } = 0; } $bib_types_by_item_type{ $branch }{ $type }{ $item_type }++; } } foreach my $f (@fields) { my $u = $f->subfield('u'); my $n = $f->subfield('9'); if (defined $n) { $uri_sub9_count++; } if (defined $u) { $uri_count++; my $ind1 = $f->indicator('1'); my $ind2 = $f->indicator('2'); if ($ind1 eq '4') { if ($ind2 eq '0' or $ind2 eq '1') { $uri_valid_count++; } } my $ustring = lc $f->as_string('u'); $ustring =~ s/http:\/\///; $ustring =~ s/ftp:\/\///; $ustring =~ s/https:\/\///; $ustring =~ s/\/.*//; push @uris, $ustring; } } #check for authority linking on 100s and 245s, if present may need to scrub them @fields = $record->field('100'); foreach my $f (@fields) { my $t = $f->subfield('0'); if (defined $t) { $title_sub0++; } } @fields = $record->field('245'); foreach my $f (@fields) { my $t = $f->subfield('0'); if (defined $t) { $author_sub0++; } } if(($i % 1000) == 0) { print "Processing bib $i.\n"; } } foreach (@ilses) { my $ils = @$_[0]; my @temp_barcodes; foreach my $h (@holdings) { my $temp_ils_name = @$h[0]; if ($temp_ils_name eq $ils) { push @temp_barcodes, @$h[1]; } } my @uniq_barcodes = uniq @temp_barcodes;; $barcode_counts{$ils} = scalar @uniq_barcodes; } my %uri_counts; $uri_counts{$_}++ for @uris; my %encoding_counts; $encoding_counts{$_}++ for @encodings; my %type_counts; $type_counts{$_}++ for @types; print "\n$filetype\n"; print "$i bibs read in file\n\n"; print "===== Leader 09, # = MARC-8, a = UCS/Unicode\n"; foreach my $key (keys %encoding_counts) { my $value = $encoding_counts{$key}; print " $key $value\n"; } print "\n"; print "===== Leader 06\n"; foreach my $key (keys %type_counts) { my $value = $type_counts{$key}; my $type = give_type($key); print " $key $value $type\n"; } print "\n"; if ($p_item_type_subfield) { print "===== Branch / Leader 06 / Item Type\n"; foreach my $branch (keys %bib_types_by_item_type) { foreach my $btype (keys %{ $bib_types_by_item_type{$branch} }) { foreach my $itype (keys %{ $bib_types_by_item_type{$branch}{$btype} }) { my $count = $bib_types_by_item_type{$branch}{$btype}{$itype}; print "$branch\t$btype (" . give_type($btype) . ")\t$itype\t$count\n"; } } } print "\n"; } print "===== Summary of Select Field Counts\n"; print " $uri_count 856 fields with a subfield u\n"; print " $uri_valid_count 856 fields with a subfield u and valid indicators\n"; print " $uri_sub9_count 856 fields have a subfield 9\n"; print " $title_sub0 100 fields have a subfield 0\n"; print " $author_sub0 245 fields have a subfield 0\n"; print "\n===== Holdings Analysis\n"; foreach my $key (keys %holding_counts) { my $c = $holding_counts{$key}; if (((100/$i)*$c) >= $holding_threshold) { print " $key $holding_counts{$key} holdings in $i bibs with $barcode_counts{$key} unique barcodes\n"; } } print "\n===== URI values are domains and filtered to only show those with more than $uri_threshold\n"; foreach my $key (keys %uri_counts) { my $value = $uri_counts{$key}; if ($value > $uri_threshold) { print " $key $value\n"; } } if ($exportbarcodes) { my @temp_barcodes; my $outfile; if ($exportbarcodesfile) { $outfile = $exportbarcodesfile; } else { $outfile = 'barcodes_export.txt'; } open my $out_fh, '>:utf8', $outfile or abort('can not open output file for barcode list'); foreach my $h (@holdings) { my $temp_ils_name = @$h[0]; my $barcode = @$h[1]; if (!defined $barcode) { $barcode = 'no barcode found'; } if ($temp_ils_name eq $exportbarcodes) { print $out_fh "@$h[1]\n" } } close $out_fh; } else { print "frack\n"; } close $file; ########### functions sub abort { my $msg = shift; print STDERR "$0: $msg", "\n"; exit 1; } sub give_type { my $type = shift; if ($type eq 'a') { return 'Language material'; } if ($type eq 'c') { return 'Notated Music'; } if ($type eq 'd') { return 'Manuscript notated music'; } if ($type eq 'e') { return 'Cartographic material'; } if ($type eq 'f') { return 'Manuscript cartographic material'; } if ($type eq 'g') { return 'Projected Medium'; } if ($type eq 'i') { return 'Nonmusical sound recording'; } if ($type eq 'j') { return 'Musical sound recording'; } if ($type eq 'k') { return 'Two-dimensional nonprojectable graphic'; } if ($type eq 'm') { return 'Computer file'; } if ($type eq 'o') { return 'Kit'; } if ($type eq 'p') { return 'Mixed materials'; } if ($type eq 'r') { return 'Three-dimensaional artifact or naturally occurring object'; } if ($type eq 't') { return 'Manuscript language material'; } if ($type eq 'z') { return 'Authority'; } return 'unknown'; }