extract_holdings

   1 #!/usr/bin/perl
   2 use strict;
   3 use warnings;
   4
   5 use Getopt::Long;
   6 use Equinox::Migration::MapDrivenMARCXMLProc;
   7 use Equinox::Migration::MARCXMLSampler;
   8
   9 my $VERSION = '1.001';
  10
  11 =pod
  12
  13 TODO
  14
  15   * Have detail mode report on number of subfields per datafield
  16
  17 =cut
  18
  19 my $c = initialize();
  20 $| = 1;
  21
  22 # run samples if we've been asked for them
  23 run_samples($c) if ($c->{sample} or $c->{samplemap} or $c->{samplestr});
  24 extract_holdings($c) if ($c->{map});
  25
  26 #--------------------------
  27
  28 sub extract_holdings {
  29     my ($c) = @_;
  30     print "Parsing records for extraction... ";
  31     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( marcfile => $c->{marcfile},
  32                                                            mapfile  => $c->{map},
  33                                                            verbose  => 1,
  34                                                          );
  35     print "Writing holdings to output file(s)...\n";
  36     # open main holdings file
  37     open HOLDINGS, '>', ($c->{prefix} . "-HOLDINGS.pg");
  38     # open the files for multi mappings
  39     # FIXME make this actually key off multi fields
  40     open X, '>', ($c->{prefix} . "-HOLDINGS-privnotes.pg");
  41     open Z, '>', ($c->{prefix} . "-HOLDINGS-pubnotes.pg");
  42     select HOLDINGS;
  43
  44     my $i = 0;
  45     for my $rec ( @{$m->{data}{recs}} ) {
  46         # for each holdings tag in the record...
  47         for my $holdidx ( @{$rec->{tmap}{ $c->{holdings} }} ) {
  48             my $tagid = $rec->{tags}[$holdidx]{tag};
  49
  50             print STDOUT "\r$i";
  51             my @out = ();            # clear the output buffer
  52             push @out, $rec->{egid}; # slug in the egid first thing
  53             print "BEGIN;\n\n" unless $i;
  54
  55             # grab the unary mappings and slug 'em in
  56             for my $sub ( sort keys %{$rec->{tags}[$holdidx]{uni}} ) {
  57                 push @out, $rec->{tags}[$holdidx]{uni}{$sub};
  58                 #print "l_", $m->name($tagid, $sub),"\t"
  59                 #  unless $i;
  60                 # FIXME column names should be made workable again
  61             }
  62
  63             for my $x (@{$rec->{tags}[$holdidx]{multi}{x}} ) {
  64                 print X $rec->{egid}, "\t",
  65                   $rec->{tags}[$holdidx]{uni}{ $c->{copyid} },
  66                     "\t$x\n";
  67             }
  68             for my $z (@{$rec->{tags}[$holdidx]{multi}{z}} ) {
  69                 print Z $rec->{egid}, "\t",
  70                   $rec->{tags}[$holdidx]{uni}{ $c->{copyid} },
  71                     "\t$z\n";
  72             }
  73
  74
  75             # now get everything else in the mapping
  76             for my $othertag ( sort keys %{$rec->{tmap}} ) {
  77                 next if $othertag eq $c->{holdings};  # ignoring the holdings, o'course
  78                 my $idx = $rec->{tmap}{$othertag}[0]; # get index into tags struct
  79                 for my $sub ( sort keys %{$rec->{tags}[$idx]{uni}} ) {
  80                     push @out, $rec->{tags}[$idx]{uni}{$sub};
  81                     print "l_", $m->name($rec->{tags}[$idx]{tag}, $sub), "\t"
  82                       unless $i;
  83                 }
  84             }
  85
  86             # and dump it
  87             print "\n" if ($i == 1);
  88             print join("\t", @out);
  89             print "\n";
  90         }
  91         $i++;
  92     }
  93     select STDOUT;
  94     print "\n";
  95 }
  96
  97 #--------------------------
  98
  99 sub run_samples {
 100     my ($c) = @_;
 101     my $s;
 102     if ($c->{samplemap}) {
 103         $s = Equinox::Migration::MARCXMLSampler->new( marcfile => $c->{marcfile},
 104                                                       mapfile  => $c->{samplemap});
 105     } elsif ($c->{samplestr}) {
 106         $s = Equinox::Migration::MARCXMLSampler->new( marcfile  => $c->{marcfile},
 107                                                       mapstring => $c->{samplestr});
 108     } else {
 109         $s = Equinox::Migration::MARCXMLSampler->new( marcfile => $c->{marcfile} );
 110     }
 111     print "Parsing records for sampling... ";
 112     $s->parse_records;
 113
 114     dump_sample_overview($c, $s) if $c->{sample};
 115     dump_sample_detail($c, $s) if ($c->{samplemap} or $c->{samplestr});
 116 }
 117
 118 sub dump_sample_detail {
 119     my ($c, $s) = @_;
 120     my $tags = $s->{data}{samp};
 121     my $count = $s->{data}{rcnt};
 122     my $scnt  = $s->{data}{scnt};
 123
 124     open DETAIL, '>', ($c->{prefix} . "-HOLDINGS-DETAIL.txt");
 125     select DETAIL;
 126     for my $tag (sort keys %{ $tags }) {
 127         print ">>>>> TAG $tag\n\n";
 128         for my $subkey (sort keys %{ $tags->{$tag} }) {
 129             my $sub = $tags->{$tag}{$subkey};
 130             print "|| $subkey | ", $sub->{value}, " | ",
 131               $sub->{count}, "/", $sub->{tcnt}, " |  ||\n";
 132         }
 133         print "\n";
 134     }
 135     close DETAIL;
 136     open SCOUNT, '>', ($c->{prefix} . "-HOLDINGS-SUBCOUNTS.txt");
 137     select SCOUNT;
 138     for my $tag (sort keys %{ $scnt }) {
 139         print ">>>>> TAG $tag\n\n";
 140         for my $len (sort keys %{ $scnt->{$tag} })
 141           { print "|| $len | ", $scnt->{$tag}{$len}, " ||\n" }
 142         print "\n";
 143     }
 144     select STDOUT;
 145 }
 146
 147 sub dump_sample_overview {
 148     my ($c, $s) = @_;
 149     my $tags = $s->{data}{tags};
 150     my $count = $s->{data}{rcnt};
 151
 152     my @tagsbyname  = sort keys %{$tags};
 153     my @tagsbycount = reverse sort { $tags->{$a} <=> $tags->{$b} } keys %{$tags};
 154
 155     open SAMPLE, '>', ($c->{prefix} . "-HOLDINGS-OVERVIEW.txt");
 156     select SAMPLE;
 157     print "SAMPLE REPORT FOR ", $c->{prefix},": $count records\n\n";
 158     print "FOUND TAGS (BY TAG)           FOUND TAGS (BY COUNT)\n";
 159     print "------------------------      --------------------------\n";
 160     for my $i (0 .. @tagsbyname - 1) {
 161         print $tagsbyname[$i], (" " x (14 - length $tags->{ $tagsbyname[$i] })),
 162           $tags->{ $tagsbyname[$i] };
 163         print " (", sprintf("%03d", int($tags->{ $tagsbyname[$i] } / $count * 100)), "%)";
 164         print "      ";
 165         print $tagsbycount[$i], (" " x (16 - length $tags->{ $tagsbycount[$i] })),
 166           $tags->{ $tagsbycount[$i] };
 167         print " (", sprintf("%03d", int($tags->{ $tagsbycount[$i] } / $count * 100)), "%)\n";
 168     }
 169     select STDOUT;
 170     print "\n";
 171     close SAMPLE;
 172 }
 173
 174 #--------------------------
 175
 176 sub initialize {
 177     my $c = {};
 178     my @missing = ();
 179
 180     # set mode on existing filehandles
 181     binmode(STDIN, ':utf8');
 182
 183     my $rc = GetOptions( $c,
 184                          'sample|s',
 185                          'samplemap|sm=s',
 186                          'samplestr|ss=s',
 187                          'marcfile|m=s',
 188                          'map=s',
 189                          'holdings|h=i',
 190                          'copyid|c=s',
 191                          'prefix|p=s',
 192                          'version|v',
 193                          'help',
 194                        );
 195     show_help() unless $rc;
 196     show_help() if ($c->{help});
 197     show_help("Nothing to do!")
 198       unless ($c->{map} or $c->{sample} or $c->{samplemap} or $c->{samplestr});
 199     show_help("map, holdings, and copyid must be specified together!")
 200       if ($c->{map} and !($c->{holdings} and $c->{copyid}));
 201     show_version() if $c->{version};
 202
 203     my @keys = keys %{$c};
 204     for my $key ('prefix', 'marcfile')
 205       { push @missing, $key unless $c->{$key} }
 206     if (@missing) {
 207         print "Required option: ", join(', ', @missing), " missing!\n";
 208         show_help();
 209     }
 210
 211     return $c;
 212 }
 213
 214 sub show_help {
 215     my ($msg) = @_;
 216     print "\nERROR - $msg\n" if $msg;
 217     print <<HELP;
 218
 219 Usage is: extract_holdings -p PREFIX -m MARCFILE [ARGUMENTS]
 220
 221 REQUIRED ARGUMENTS
 222   --prefix   -p  Prefix string for output filenames
 223   --marcfile -m  MARCXML to use as source data
 224
 225 SAMPLING ARGUMENTS
 226   --sample    -s   Generate a report of all tags in the MARC data
 227   --samplemap -sm  Specify a E::M::STL map file which will be used to generate
 228                    subfield breakdown reports about specific tags in the MARC
 229                    data
 230   --samplestr -ss  As above, but with a one-liner map specified on the command
 231                    line as a string (e.g. '-ss "852 999"')
 232
 233   If --samplemap and --samplestr are both specified, --samplemap wins.
 234
 235 HOLDINGS EXTRACTION ARGUMENTS
 236   --map          E::M::SM map file which will be used to extract holdings data
 237                  from the input MARC file
 238   --holdings -h  Specifies actual holdings tag
 239   --copyid   -c  Specifies subfield of holdings with unique copy identifier
 240
 241   Both these must be given together.
 242 HELP
 243     exit;
 244 }
 245
 246 sub show_version { print "extract_holdings v$VERSION\n"; exit }