Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm

   1 package Equinox::Migration::MapDrivenMARCXMLProc;
   2
   3 use warnings;
   4 use strict;
   5
   6 use XML::Twig;
   7 use Equinox::Migration::SubfieldMapper 1.003;
   8
   9 # FIXME
  10 #
  11 # sample functionality should be extracted into a new module which
  12 # uses E::M::SM to drive sampling of individual datafields, and
  13 # reports ALL datafields which occur
  14 #
  15 # --sample should give the list of all datafields
  16 # --samplefile should take a SM map as teh argument and introspect the mapped datafields
  17
  18
  19 =head1 NAME
  20
  21 Equinox::Migration::MapDrivenMARCXMLProc
  22
  23 =head1 VERSION
  24
  25 Version 1.000
  26
  27 =cut
  28
  29 our $VERSION = '1.000';
  30
  31
  32 =head1 SYNOPSIS
  33
  34 Foo
  35
  36     use Equinox::Migration::MapDrivenMARCXMLProc;
  37
  38
  39 =head1 METHODS
  40
  41
  42 =head2 new
  43
  44 Takes two required arguments: C<mapfile> (which will be passed along
  45 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
  46 and C<marcfile> (the MARC data to be processed).
  47
  48     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile  => FILE,
  49                                                            marcfile => FILE );
  50
  51 There is an optional third, argument, C<sample>, which specifies a
  52 arrayref of datafields to "sample" by reporting on subfields which are
  53 found in the data but not in the map.
  54
  55     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile  => FILE,
  56                                                            marcfile => FILE,
  57                                                            sample   => \@TAGS
  58                                                          );
  59
  60 See L</UNMAPPED TAGS> for more info.
  61
  62 =cut
  63
  64 sub new {
  65     my ($class, %args) = @_;
  66
  67     my $self = bless { mods => { multi    => {},
  68                                  bib      => {},
  69                                  required => {},
  70                                },
  71                        data => { recs => undef, # X::T record objects
  72                                  rptr => 0,     # next record pointer
  73                                  crec => undef, # parsed record storage
  74                                  stag => undef, # list of tags to sample
  75                                  umap => undef, # unmapped data samples
  76                                },
  77                      }, $class;
  78
  79     # initialize map and taglist
  80     die "Argument 'mapfile' must be specified\n" unless (defined $args{mapfile});
  81     my @mods = keys %{$self->{mods}};
  82     $self->{map} = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
  83                                                             mods => \@mods );
  84     $self->{data}{tags} = $self->{map}->tags;
  85
  86     # initialize twig
  87     die "Argument 'marcfile' must be specified\n" unless (defined $args{marcfile});
  88     if (-r $args{marcfile}) {
  89         $self->{twig} = XML::Twig->new;
  90         $self->{twig}->parsefile($args{marcfile});
  91         my @records = $self->{twig}->root->children;
  92         $self->{data}{recs} = \@records;
  93     } else {
  94         die "Can't open marc file: $!\n";
  95     }
  96
  97     # if we have a sample arg, set up the sample set and umap hash
  98     if (defined $args{sample}) {
  99         for my $s ( @{$args{sample}})
 100           { $self->{data}{stag}{$s} = 1 }
 101         $self->{data}{umap} = {};
 102     }
 103
 104     return $self;
 105 }
 106
 107
 108 =head2 parse_record
 109
 110 Extracts data from the next record, per the mapping file. Returns a
 111 normalized datastructure (see L</format_record> for details) on
 112 success; returns 0 otherwise.
 113
 114     while (my $rec = $m->parse_record) {
 115       # handle extracted record data
 116     }
 117
 118 =cut
 119
 120 sub parse_record {
 121     my ($self) = @_;
 122
 123     # get the next record and wipe current parsed record
 124     return 0 unless defined $self->{data}{recs}[ $self->{data}{rptr} ];
 125     my $record = $self->{data}{recs}[ $self->{data}{rptr} ];
 126     $self->{data}{crec} = { egid => undef, bib  => undef, tags => undef };
 127
 128     my @fields = $record->children;
 129     for my $f (@fields)
 130       { $self->process_field($f) }
 131
 132     # cleanup memory and increment pointer
 133     $record->purge;
 134     $self->{data}{rptr}++;
 135
 136     # check for required fields
 137     $self->check_required;
 138
 139     return $self->{data}{crec};
 140 }
 141
 142 sub process_field {
 143     my ($self, $field) = @_;
 144     my $map = $self->{map};
 145     my $tag = $field->{'att'}->{'tag'};
 146     my $crec = $self->{data}{crec};
 147
 148     # leader
 149     unless (defined $tag) {
 150         #FIXME
 151         return;
 152     }
 153
 154     # datafields
 155     if ($tag == 903) {
 156         my $sub = $field->first_child('subfield');
 157         $crec->{egid} = $sub->text;
 158         return;
 159     }
 160     if ($map->has($tag)) {
 161         push @{$crec->{tags}}, { tag => $tag, uni => undef, multi => undef };
 162         my @subs = $field->children('subfield');
 163         for my $sub (@subs)
 164           { $self->process_subs($tag, $sub) }
 165         # check map to ensure all declared subs have a value
 166         my $mods = $map->mods($field);
 167         for my $mappedsub ( @{ $map->subfields($tag) } ) {
 168             next if $mods->{multi};
 169             $crec->{tags}[-1]{uni}{$mappedsub} = ''
 170               unless defined $crec->{tags}[-1]{uni}{$mappedsub};
 171         }
 172     }
 173 }
 174
 175 sub process_subs {
 176     my ($self, $tag, $sub) = @_;
 177     my $map  = $self->{map};
 178     my $code = $sub->{'att'}->{'code'};
 179
 180     # handle unmapped tag/subs
 181     unless ($map->has($tag, $code)) {
 182         my $u = $self->{data}{umap};
 183         my $s = $self->{data}{stag};
 184         return unless (defined $s->{$tag});
 185
 186         # set a value, total-seen count and records-seen-in count
 187         $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code};
 188         $u->{$tag}{$code}{count}++;
 189         $u->{$tag}{$code}{rcnt}++ unless ( defined $u->{$tag}{$code}{last} and
 190                                            $u->{$tag}{$code}{last} == $self->{data}{rptr} );
 191         $u->{$tag}{$code}{last} = $self->{data}{rptr};
 192         return;
 193     }
 194
 195     # fetch our datafield struct and fieldname
 196     my $dataf = $self->{data}{crec}{tags}[-1];
 197     my $field = $map->field($tag, $code);
 198
 199     # handle modifiers
 200     if (my $mods = $map->mods($field)) {
 201         if ($mods->{multi}) {
 202             my $name = $tag . $code;
 203             push @{$dataf->{multi}{$name}}, $sub->text;
 204             return;
 205         }
 206     }
 207
 208     die "Multiple occurances of a non-multi field: $tag$code at rec ",($self->{data}{rptr} + 1),"\n"
 209       if (defined $dataf->{uni}{$code});
 210     $dataf->{uni}{$code} = $sub->text;
 211 }
 212
 213
 214 sub check_required {
 215     my ($self) = @_;
 216     my $mods = $self->{map}->mods;
 217     my $crec = $self->{data}{crec};
 218
 219     for my $tag_id (keys %{$mods->{required}}) {
 220         for my $code (@{$mods->{required}{$tag_id}}) {
 221             my $found = 0;
 222
 223             $found = 1 if ($crec->{bib}{($tag_id . $code)});
 224             for my $tag (@{$crec->{tags}}) {
 225                 $found = 1 if ($tag->{multi}{($tag_id . $code)});
 226                 $found = 1 if ($tag->{uni}{$code});
 227             }
 228
 229             die "Required mapping $tag_id$code not found in rec ",$self->{data}{rptr},"\n"
 230               unless ($found);
 231         }
 232     }
 233
 234 }
 235
 236 =head1 MODIFIERS
 237
 238 MapDrivenMARCXMLProc implements the following modifiers, and passes
 239 them to L<Equinox::Migration::SubfieldMapper>, meaning that specifying
 240 any other modifiers in a MDMP map file will cause a fatal error when
 241 it is processed.
 242
 243 =head2 multi
 244
 245 If a mapping is declared to be C<multi>, then MDMP expects to see more
 246 than one instance of that subfield per datafield, and the data is
 247 handled accordingly (see L</PARSED RECORDS> below).
 248
 249 Occurring zero or one time is legal for a C<multi> mapping.
 250
 251 A mapping which is not flagged as C<multi>, but which occurs more than
 252 once per datafield will cause a fatal error.
 253
 254 =head2 bib
 255
 256 The C<bib> modifier declares that a mapping is "bib-level", and should
 257 be encountered once per B<record> instead of once per B<datafield> --
 258 which is another way of saying that it occurs in a non-repeating
 259 datafield or in a controlfield.
 260
 261 =head2 required
 262
 263 By default, if a mapping does not occur in a datafield (or record, in
 264 the case of C<bib> mappings), processing continues normally. if a
 265 mapping has the C<required> modifier, however, it must appear, or a
 266 fatal error will occur.
 267
 268 =head1 PARSED RECORDS
 269
 270 Given:
 271
 272     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new(ARGUMENTS);
 273     $rec = $m->parse_record;
 274
 275 Then C<$rec> will look like:
 276
 277     {
 278       egid   => evergreen_record_id,
 279       bib    => {
 280                   (tag_id . sub_code)1 => value1,
 281                   (tag_id . sub_code)2 => value2,
 282                   ...
 283                 },
 284       tags => [
 285                 {
 286                   tag   => tag_id,
 287                   multi => { (tag_id . sub_code) => [ val1, val2, ... ] },
 288                   uni   => { code => value, code2 => value2, ... },
 289                 },
 290                 ...
 291               ]
 292     }
 293
 294 That is, there is an C<egid> key which points to the Evergreen ID of
 295 that record, a C<bib> key which points to a hashref, and a C<tags>
 296 key which points to an arrayref.
 297
 298 =head3 C<bib>
 299
 300 A reference to a hash which holds extracted data which occurs only
 301 once per record (and is therefore "bib-level"; the default assumption
 302 is that a tag/subfield pair can occur multiple times per record). The
 303 keys are composed of tag id and subfield code, catenated
 304 (e.g. 901c). The values are the contents of that subfield of that tag.
 305
 306 If there are no tags defined as bib-level in the mapfile, C<bib> will
 307 be C<undef>.
 308
 309 =head3 C<tags>
 310
 311 A reference to a list of anonymous hashes, one for each instance of
 312 each tag which occurs in the map.
 313
 314 Each tag hash holds its own id (e.g. C<998>), and two references to
 315 two more hashrefs, C<multi> and C<uni>.
 316
 317 The C<multi> hash holds the extracted data for tag/sub mappings which
 318 have the C<multiple> modifier on them. The keys in C<multi> are
 319 composed of the tag id and subfield code, catenated
 320 (e.g. C<901c>). The values are arrayrefs containing the content of all
 321 instances of that subfield in that instance of that tag. If no tags
 322 are defined as C<multi>, it will be C<undef>.
 323
 324 The C<uni> hash holds data for tag/sub mappings which occur only once
 325 per instance of a tag (but may occur multiple times in a record due to
 326 there being multiple instances of that tag in a record). Keys are
 327 subfield codes and values are subfield content.
 328
 329 All C<uni> subfields occuring in the map are guaranteed to be
 330 defined. Sufields which are mapped but do not occur in a particular
 331 datafield will be given a value of '' (the null string) in the current
 332 record struct. Oppose subfields which are not mapped, which will be
 333 C<undef>.
 334
 335 =head1 UNMAPPED TAGS
 336
 337 If the C<sample> argument is passed to L</new>, there will also be a
 338 structure which holds data about unmapped subfields encountered in
 339 mapped tags which are also in the declared sample set. This
 340 information is collected over the life of the object and is not reset
 341 for every record processed (as the current record data neccessarily
 342 is).
 343
 344     { tag_id => {
 345                   sub_code  => { value => VALUE,
 346                                  count => COUNT,
 347                                  rcnt => RCOUNT
 348                                },
 349                   ...
 350                 },
 351       ...
 352     }
 353
 354 For each mapped tag, for each unmapped subfield, there is a hash of
 355 data about that subfield containing
 356
 357     * value - A sample of the subfield text
 358     * count - Total number of times the subfield was seen
 359     * rcnt  - The number of records the subfield was seen in
 360
 361 =head1 AUTHOR
 362
 363 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
 364
 365 =head1 BUGS
 366
 367 Please report any bugs or feature requests to the above email address.
 368
 369 =head1 SUPPORT
 370
 371 You can find documentation for this module with the perldoc command.
 372
 373     perldoc Equinox::Migration::MapDrivenMARCXMLProc
 374
 375
 376 =head1 COPYRIGHT & LICENSE
 377
 378 Copyright 2009 Equinox, all rights reserved.
 379
 380 This program is free software; you can redistribute it and/or modify it
 381 under the same terms as Perl itself.
 382
 383
 384 =cut
 385
 386 1; # End of Equinox::Migration::MapDrivenMARCXMLProc