2 # -*- coding: iso-8859-15 -*-
3 ###############################################################################
6 =item B<bibstats> --file foo.mrc
8 Reads through a marc file to generate statistical information about the file
11 --uri_threshold defaults to 1, only shows URI values with more than that
14 --ignore_filetype true will have it not care what file returns as the type and
15 always treat it as marc21
17 --ils --holding_code --barcode_subfield work together to pass an new ILS
18 definnition without it being hardcode in the script and can test arbitary
21 --exportbarcodes ils_name is used if you want to export the barcodes associated
22 with one of the ILSes so provide the name
24 --exportbarcodesfile will use this file name for a barcode export instead
25 of the generic 'barcodes_export.txt'
30 ###############################################################################
37 HOME PGHOST PGPORT PGUSER PGDATABASE MIGSCHEMA
38 MIGBASEWORKDIR MIGBASEGITDIR MIGGITDIR MIGWORKDIR
48 use List::MoreUtils qw(uniq);
50 my $mig_bin = "$FindBin::Bin/";
51 use lib "$FindBin::Bin/";
54 use open ':encoding(utf8)';
56 pod2usage(-verbose => 2) if defined $ARGV[0] && $ARGV[0] eq '--help';
57 pod2usage(-verbose => 1) if ! $ARGV[1];
60 my $uri_threshold = 1;
62 my $p_barcode_subfield;
64 my $holding_threshold = 50;
65 my $p_ignore_filetype = 'false';
69 my $exportbarcodesfile;
73 'uri_threshold:i' => \$uri_threshold,
74 'holding_code:s' => \$p_holding_code,
75 'barcode_subfield:s' => \$p_barcode_subfield,
76 'ignore_filetype:s' => \$p_ignore_filetype,
77 'ils:s' => \$p_ils_name,
78 'exportbarcodes:s' => \$exportbarcodes,
79 'exportbarcodesfile:s' => \$exportbarcodesfile,
80 'holding_threshold:s' => \$holding_threshold
83 if ($exportbarcodesfile and !defined $exportbarcodes) { abort('You have to provide an ILS name if you want a barcode export file.'); }
85 if ($p_holding_code and length $p_holding_code != 3) { abort('Holdings codes must be three characters.'); }
87 if ($p_barcode_subfield) {
88 if (!defined $p_holding_code) { abort('A barcode field can not be used without a holding code.'); }
89 if (length $p_barcode_subfield != 1) { abort('Barcode subfields must be a single character code.'); }
92 # ils name, holding tag, barcode subfield
94 ['Mandarin','852','p'],
95 ['Evergreen','852','p'],
96 ['Polaris','852','p'],
99 ['Symphony','999','i'],
100 ['Destiny','852','p']
104 if ($p_holding_code) {
105 push @temp, $p_ils_name;
106 push @temp, $p_holding_code;
107 if ($p_barcode_subfield) { push @temp, lc $p_barcode_subfield; }
108 push @ilses, [@temp];
111 #to do - add a check for exportbarcodes being in @ilses
113 my $filetype = `file $file`;
115 if ($filetype =~ m/MARC21/) {
116 $batch = MARC::Batch->new( 'USMARC', $file );
118 $batch = MARC::Batch->new( 'XML', $file );
120 $batch->strict_off();
124 my $uri_valid_count = 0;
125 my $uri_sub9_count = 0;
132 my @holding_code_strings;
137 $holding_counts{@$_[0]} = 0;
138 $barcode_counts{@$_[0]} = 0;
141 while ( my $record = $batch->next() ) {
143 #check holdings, bit time consuming but more future proof
147 my $barcode = @$_[2];
148 my @holding_fields = $record->field($hcode);
149 foreach my $hf (@holding_fields) {
151 my $barcode_string = $hf->subfield($barcode);
153 push @h, $barcode_string;
154 push @holdings, [@h];
156 my $l = scalar @holding_fields;
157 my $v = $holding_counts{$ils};
158 if ($l) { $holding_counts{$ils} = $v + $l; }
161 @fields = $record->field('856');
162 my $enc = substr $record->leader(), 9, 1;
163 push @encodings, $enc;
164 my $type = substr $record->leader(), 6, 1;
166 foreach my $f (@fields) {
167 my $u = $f->subfield('u');
168 my $n = $f->subfield('9');
169 if (defined $n) { $uri_sub9_count++; }
172 my $ind1 = $f->indicator('1');
173 my $ind2 = $f->indicator('2');
175 if ($ind2 eq '0' or $ind2 eq '1') { $uri_valid_count++; }
177 my $ustring = lc $f->as_string('u');
178 $ustring =~ s/http:\/\///;
179 $ustring =~ s/ftp:\/\///;
180 $ustring =~ s/https:\/\///;
181 $ustring =~ s/\/.*//;
182 push @uris, $ustring;
185 #check for authority linking on 100s and 245s, if present may need to scrub them
186 @fields = $record->field('100');
187 foreach my $f (@fields) {
188 my $t = $f->subfield('0');
189 if (defined $t) { $title_sub0++; }
191 @fields = $record->field('245');
192 foreach my $f (@fields) {
193 my $t = $f->subfield('0');
194 if (defined $t) { $author_sub0++; }
196 if(($i % 1000) == 0) { print "Processing bib $i.\n"; }
202 foreach my $h (@holdings) {
203 my $temp_ils_name = @$h[0];
204 if ($temp_ils_name eq $ils) { push @temp_barcodes, @$h[1]; }
206 my @uniq_barcodes = uniq @temp_barcodes;;
207 $barcode_counts{$ils} = scalar @uniq_barcodes;
211 $uri_counts{$_}++ for @uris;
214 $encoding_counts{$_}++ for @encodings;
217 $type_counts{$_}++ for @types;
219 print "\n$filetype\n";
220 print "$i bibs read in file\n\n";
222 print "===== Leader 09, # = MARC-8, a = UCS/Unicode\n";
223 foreach my $key (keys %encoding_counts) {
224 my $value = $encoding_counts{$key};
225 print " $key $value\n";
229 print "===== Leader 06\n";
230 foreach my $key (keys %type_counts) {
231 my $value = $type_counts{$key};
232 my $type = give_type($key);
233 print " $key $value $type\n";
237 print "===== Summary of Select Field Counts\n";
238 print " $uri_count 856 fields with a subfield u\n";
239 print " $uri_valid_count 856 fields with a subfield u and valid indicators\n";
240 print " $uri_sub9_count 856 fields have a subfield 9\n";
241 print " $title_sub0 100 fields have a subfield 0\n";
242 print " $author_sub0 245 fields have a subfield 0\n";
244 print "\n===== Holdings Analysis\n";
245 foreach my $key (keys %holding_counts) {
246 my $c = $holding_counts{$key};
247 if (((100/$i)*$c) >= $holding_threshold) {
248 if ($key ne 'Symphony') { print " $key - $holding_counts{$key} holdings in $i bibs with $barcode_counts{$key} unique barcodes\n"; }
249 else { print " $key - $holding_counts{$key} holdings in $i bibs with $barcode_counts{$key} unique barcodes - 999 FIELDS MUST BE MOVED WITH kmig prepbibs\n"; }
253 print "\n===== URI values are domains and filtered to only show those with more than $uri_threshold\n";
254 foreach my $key (keys %uri_counts) {
255 my $value = $uri_counts{$key};
256 if ($value > $uri_threshold) { print " $key $value\n"; }
259 if ($exportbarcodes) {
262 if ($exportbarcodesfile) { $outfile = $exportbarcodesfile; } else { $outfile = 'barcodes_export.txt'; }
263 open my $out_fh, '>:utf8', $outfile or abort('can not open output file for barcode list');
264 foreach my $h (@holdings) {
265 my $temp_ils_name = @$h[0];
266 my $barcode = @$h[1];
267 if (!defined $barcode) { $barcode = 'no barcode found'; }
268 if ($temp_ils_name eq $exportbarcodes) { print $out_fh "@$h[1]\n" }
271 } else { print "No barcodes being exported.\n"; }
275 ########### functions
279 print STDERR "$0: $msg", "\n";
285 if ($type eq 'a') { return 'Language material'; }
286 if ($type eq 'c') { return 'Notated Music'; }
287 if ($type eq 'd') { return 'Manuscript notated music'; }
288 if ($type eq 'e') { return 'Cartographic material'; }
289 if ($type eq 'f') { return 'Manuscript cartographic material'; }
290 if ($type eq 'g') { return 'Projected Medium'; }
291 if ($type eq 'i') { return 'Nonmusical sound recording'; }
292 if ($type eq 'j') { return 'Musical sound recording'; }
293 if ($type eq 'k') { return 'Two-dimensional nonprojectable graphic'; }
294 if ($type eq 'm') { return 'Computer file'; }
295 if ($type eq 'o') { return 'Kit'; }
296 if ($type eq 'p') { return 'Mixed materials'; }
297 if ($type eq 'r') { return 'Three-dimensaional artifact or naturally occurring object'; }
298 if ($type eq 't') { return 'Manuscript language material'; }
299 if ($type eq 'z') { return 'Authority'; }