Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm

   1 package Equinox::Migration::MapDrivenMARCXMLProc;
   2
   3 use warnings;
   4 use strict;
   5
   6 use XML::Twig;
   7 use Equinox::Migration::SubfieldMapper 1.002;
   8
   9 # FIXME
  10 #
  11 # sample functionality should be extracted into a new module which
  12 # uses E::M::SM to drive sampling of individual datafields, and
  13 # reports ALL datafields which occur
  14 #
  15 # --sample should give the list of all datafields
  16 # --samplefile should take a SM map as teh argument and introspect the mapped datafields
  17
  18
  19 =head1 NAME
  20
  21 Equinox::Migration::MapDrivenMARCXMLProc
  22
  23 =head1 VERSION
  24
  25 Version 1.000
  26
  27 =cut
  28
  29 our $VERSION = '1.000';
  30
  31
  32 =head1 SYNOPSIS
  33
  34 Foo
  35
  36     use Equinox::Migration::MapDrivenMARCXMLProc;
  37
  38
  39 =head1 METHODS
  40
  41
  42 =head2 new
  43
  44 Takes two required arguments: C<mapfile> (which will be passed along
  45 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
  46 and C<marcfile> (the MARC data to be processed).
  47
  48     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile  => FILE,
  49                                                            marcfile => FILE );
  50
  51 There is an optional third, argument, C<sample>, which specifies a
  52 arrayref of datafields to "sample" by reporting on subfields which are
  53 found in the data but not in the map.
  54
  55     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile  => FILE,
  56                                                            marcfile => FILE,
  57                                                            sample   => \@TAGS
  58                                                          );
  59
  60 See L</UNMAPPED TAGS> for more info.
  61
  62 =cut
  63
  64 sub new {
  65     my ($class, %args) = @_;
  66
  67     my $self = bless { mods => { multi    => {},
  68                                  once     => {},
  69                                  required => {},
  70                                },
  71                        data => { recs => undef, # X::T record objects
  72                                  rptr => 0,     # next record pointer
  73                                  crec => undef, # parsed record storage
  74                                  stag => undef, # list of tags to sample
  75                                  umap => undef, # unmapped data samples
  76                                },
  77                      }, $class;
  78
  79     # initialize map and taglist
  80     die "Argument 'mapfile' must be specified\n" unless (defined $args{mapfile});
  81     my @mods = keys %{$self->{mods}};
  82     $self->{map} = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
  83                                                             mods => \@mods );
  84     $self->{data}{tags} = $self->{map}->tags;
  85
  86     # initialize twig
  87     die "Argument 'marcfile' must be specified\n" unless (defined $args{marcfile});
  88     if (-r $args{marcfile}) {
  89         $self->{twig} = XML::Twig->new;
  90         $self->{twig}->parsefile($args{marcfile});
  91         my @records = $self->{twig}->root->children;
  92         $self->{data}{recs} = \@records;
  93     } else {
  94         die "Can't open marc file: $!\n";
  95     }
  96
  97     # if we have a sample arg, set up the sample set and umap hash
  98     if (defined $args{sample}) {
  99         for my $s ( @{$args{sample}})
 100           { $self->{data}{stag}{$s} = 1 }
 101         $self->{data}{umap} = {};
 102     }
 103
 104     return $self;
 105 }
 106
 107
 108 =head2 parse_record
 109
 110 Extracts data from the next record, per the mapping file. Returns a
 111 normalized datastructure (see L</format_record> for details) on
 112 success; returns 0 otherwise.
 113
 114     while (my $rec = $m->parse_record) {
 115       # handle extracted record data
 116     }
 117
 118 =cut
 119
 120 sub parse_record {
 121     my ($self) = @_;
 122
 123     # get the next record and wipe current parsed record
 124     return 0 unless defined $self->{data}{recs}[ $self->{data}{rptr} ];
 125     my $record = $self->{data}{recs}[ $self->{data}{rptr} ];
 126     $self->{data}{crec} = { egid => undef, bib  => undef, tags => undef };
 127
 128     my @fields = $record->children;
 129     for my $f (@fields)
 130       { $self->process_field($f) }
 131
 132     # cleanup memory and increment pointer
 133     $record->purge;
 134     $self->{data}{rptr}++;
 135
 136     return $self->{data}{crec};
 137 }
 138
 139 sub process_field {
 140     my ($self, $field) = @_;
 141     my $map = $self->{map};
 142     my $tag = $field->{'att'}->{'tag'};
 143     my $crec = $self->{data}{crec};
 144
 145     # leader
 146     unless (defined $tag) {
 147         #FIXME
 148         return;
 149     }
 150
 151     # datafields
 152     if ($tag == 903) {
 153         my $sub = $field->first_child('subfield');
 154         $crec->{egid} = $sub->text;
 155         return;
 156     }
 157     if ($map->has($tag)) {
 158         push @{$crec->{tags}}, { tag => $tag, uni => undef, multi => undef };
 159         my @subs = $field->children('subfield');
 160         for my $sub (@subs)
 161           { $self->process_subs($tag, $sub) }
 162         # check map to ensure all declared subs have a value
 163         my $mods = $map->mods($field);
 164         for my $mappedsub ( @{ $map->subfields($tag) } ) {
 165             next if $mods->{multi};
 166             $crec->{tags}[-1]{uni}{$mappedsub} = ''
 167               unless defined $crec->{tags}[-1]{uni}{$mappedsub};
 168         }
 169     }
 170 }
 171
 172 sub process_subs {
 173     my ($self, $tag, $sub) = @_;
 174     my $map  = $self->{map};
 175     my $code = $sub->{'att'}->{'code'};
 176
 177     # handle unmapped tag/subs
 178     unless ($map->has($tag, $code)) {
 179         my $u = $self->{data}{umap};
 180         my $s = $self->{data}{stag};
 181         return unless (defined $s->{$tag});
 182
 183         # set a value, total-seen count and records-seen-in count
 184         $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code};
 185         $u->{$tag}{$code}{count}++;
 186         $u->{$tag}{$code}{rcnt}++ unless ( defined $u->{$tag}{$code}{last} and
 187                                            $u->{$tag}{$code}{last} == $self->{data}{rptr} );
 188         $u->{$tag}{$code}{last} = $self->{data}{rptr};
 189         return;
 190     }
 191
 192     # fetch our datafield struct and fieldname
 193     my $dataf = $self->{data}{crec}{tags}[-1];
 194     my $field = $map->field($tag, $code);
 195
 196     # handle modifiers, or slug data in normally
 197     if (my $mods = $map->mods($field)) {
 198         if ($mods->{multi}) {
 199             my $name = $tag . $code;
 200             push @{$dataf->{multi}{$name}}, $sub->text;
 201         }
 202     } else {
 203         die "Multiple occurances of a non-multi field: \n"
 204           if (defined $dataf->{uni}{$code});
 205         $dataf->{uni}{$code} = $sub->text;
 206     }
 207 }
 208
 209 =head1 MODIFIERS
 210
 211 MapDrivenMARCXMLProc implements the following modifiers, and passes
 212 them to L<Equinox::Migration::SubfieldMapper>, meaning that specifying
 213 any other modifiers in a MDMP map file will cause a fatal error when
 214 it is processed.
 215
 216 =head2 multi
 217
 218 If a mapping is declared to be C<multi>, then MDMP expects to see more
 219 than one instance of that subfield per datafield, and the data is
 220 handled accordingly (see L</PARSED RECORDS> below).
 221
 222 Occurring zero or one time is legal for a C<multi> mapping.
 223
 224 A mapping which is not flagged as C<multi>, but which occurs more than
 225 once per datafield will cause a fatal error.
 226
 227 =head2 bib
 228
 229 =head2 required
 230
 231 =head1 PARSED RECORDS
 232
 233 Given:
 234
 235     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new(ARGUMENTS);
 236     $rec = $m->parse_record;
 237
 238 Then C<$rec> will look like:
 239
 240     {
 241       egid   => evergreen_record_id,
 242       bib    => {
 243                   (tag_id . sub_code)1 => value1,
 244                   (tag_id . sub_code)2 => value2,
 245                   ...
 246                 },
 247       tags => [
 248                 {
 249                   tag   => tag_id,
 250                   multi => { (tag_id . sub_code) => [ val1, val2, ... ] },
 251                   uni   => { code => value, code2 => value2, ... },
 252                 },
 253                 ...
 254               ]
 255     }
 256
 257 That is, there is an C<egid> key which points to the Evergreen ID of
 258 that record, a C<bib> key which points to a hashref, and a C<tags>
 259 key which points to an arrayref.
 260
 261 =head3 C<bib>
 262
 263 A reference to a hash which holds extracted data which occurs only
 264 once per record (and is therefore "bib-level"; the default assumption
 265 is that a tag/subfield pair can occur multiple times per record). The
 266 keys are composed of tag id and subfield code, catenated
 267 (e.g. 901c). The values are the contents of that subfield of that tag.
 268
 269 If there are no tags defined as bib-level in the mapfile, C<bib> will
 270 be C<undef>.
 271
 272 =head3 C<tags>
 273
 274 A reference to a list of anonymous hashes, one for each instance of
 275 each tag which occurs in the map.
 276
 277 Each tag hash holds its own id (e.g. C<998>), and two references to
 278 two more hashrefs, C<multi> and C<uni>.
 279
 280 The C<multi> hash holds the extracted data for tag/sub mappings which
 281 have the C<multiple> modifier on them. The keys in C<multi> are
 282 composed of the tag id and subfield code, catenated
 283 (e.g. C<901c>). The values are arrayrefs containing the content of all
 284 instances of that subfield in that instance of that tag. If no tags
 285 are defined as C<multi>, it will be C<undef>.
 286
 287 The C<uni> hash holds data for tag/sub mappings which occur only once
 288 per instance of a tag (but may occur multiple times in a record due to
 289 there being multiple instances of that tag in a record). Keys are
 290 subfield codes and values are subfield content.
 291
 292 All C<uni> subfields occuring in the map are guaranteed to be
 293 defined. Sufields which are mapped but do not occur in a particular
 294 datafield will be given a value of '' (the null string) in the current
 295 record struct. Oppose subfields which are not mapped, which will be
 296 C<undef>.
 297
 298 =head1 UNMAPPED TAGS
 299
 300 If the C<sample> argument is passed to L</new>, there will also be a
 301 structure which holds data about unmapped subfields encountered in
 302 mapped tags which are also in the declared sample set. This
 303 information is collected over the life of the object and is not reset
 304 for every record processed (as the current record data neccessarily
 305 is).
 306
 307     { tag_id => {
 308                   sub_code  => { value => VALUE,
 309                                  count => COUNT,
 310                                  rcnt => RCOUNT
 311                                },
 312                   ...
 313                 },
 314       ...
 315     }
 316
 317 For each mapped tag, for each unmapped subfield, there is a hash of
 318 data about that subfield containing
 319
 320     * value - A sample of the subfield text
 321     * count - Total number of times the subfield was seen
 322     * rcnt  - The number of records the subfield was seen in
 323
 324 =head1 AUTHOR
 325
 326 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
 327
 328 =head1 BUGS
 329
 330 Please report any bugs or feature requests to the above email address.
 331
 332 =head1 SUPPORT
 333
 334 You can find documentation for this module with the perldoc command.
 335
 336     perldoc Equinox::Migration::MapDrivenMARCXMLProc
 337
 338
 339 =head1 COPYRIGHT & LICENSE
 340
 341 Copyright 2009 Equinox, all rights reserved.
 342
 343 This program is free software; you can redistribute it and/or modify it
 344 under the same terms as Perl itself.
 345
 346
 347 =cut
 348
 349 1; # End of Equinox::Migration::MapDrivenMARCXMLProc