Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm

   1 package Equinox::Migration::MapDrivenMARCXMLProc;
   2
   3 use warnings;
   4 use strict;
   5
   6 use XML::Twig;
   7 use Equinox::Migration::SubfieldMapper 1.002;
   8
   9 # FIXME
  10 #
  11 # sample functionality should be extracted into a new module which
  12 # uses E::M::SM to drive sampling of individual datafields, and
  13 # reports ALL datafields which occur
  14 #
  15 # --sample should give the list of all datafields
  16 # --samplefile should take a SM map as teh argument and introspect the mapped datafields
  17
  18
  19 =head1 NAME
  20
  21 Equinox::Migration::MapDrivenMARCXMLProc
  22
  23 =head1 VERSION
  24
  25 Version 1.000
  26
  27 =cut
  28
  29 our $VERSION = '1.000';
  30
  31
  32 =head1 SYNOPSIS
  33
  34 Foo
  35
  36     use Equinox::Migration::MapDrivenMARCXMLProc;
  37
  38
  39 =head1 METHODS
  40
  41
  42 =head2 new
  43
  44 Takes two required arguments: C<mapfile> (which will be passed along
  45 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
  46 and C<marcfile> (the MARC data to be processed).
  47
  48     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile  => FILE,
  49                                                            marcfile => FILE );
  50
  51 There is an optional third, argument, C<sample>, which specifies a
  52 arrayref of datafields to "sample" by reporting on subfields which are
  53 found in the data but not in the map.
  54
  55     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile  => FILE,
  56                                                            marcfile => FILE,
  57                                                            sample   => \@TAGS
  58                                                          );
  59
  60 See L</UNMAPPED TAGS> for more info.
  61
  62 =cut
  63
  64 sub new {
  65     my ($class, %args) = @_;
  66
  67     my $self = bless { mods => { multi    => {},
  68                                  once     => {},
  69                                  required => {},
  70                                },
  71                        data => { recs => undef, # X::T record objects
  72                                  rptr => 0,     # next record pointer
  73                                  crec => undef, # parsed record storage
  74                                  stag => undef, # list of tags to sample
  75                                  umap => undef, # unmapped data samples
  76                                },
  77                      }, $class;
  78
  79     # initialize map and taglist
  80     die "Argument 'mapfile' must be specified\n" unless (defined $args{mapfile});
  81     my @mods = keys %{$self->{mods}};
  82     $self->{map} = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
  83                                                             mods => \@mods );
  84     $self->{data}{tags} = $self->{map}->tags;
  85
  86     # initialize twig
  87     die "Argument 'marcfile' must be specified\n" unless (defined $args{marcfile});
  88     if (-r $args{marcfile}) {
  89         $self->{twig} = XML::Twig->new;
  90         $self->{twig}->parsefile($args{marcfile});
  91         my @records = $self->{twig}->root->children;
  92         $self->{data}{recs} = \@records;
  93     } else {
  94         die "Can't open marc file: $!\n";
  95     }
  96
  97     # if we have a sample arg, set up the sample set and umap hash
  98     if (defined $args{sample}) {
  99         for my $s ( @{$args{sample}})
 100           { $self->{data}{stag}{$s} = 1 }
 101         $self->{data}{umap} = {};
 102     }
 103
 104     return $self;
 105 }
 106
 107
 108 =head2 parse_record
 109
 110 Extracts data from the next record, per the mapping file. Returns a
 111 normalized datastructure (see L</format_record> for details) on
 112 success; returns 0 otherwise.
 113
 114     while (my $rec = $m->parse_record) {
 115       # handle extracted record data
 116     }
 117
 118 =cut
 119
 120 sub parse_record {
 121     my ($self) = @_;
 122
 123     # get the next record and wipe current parsed record
 124     return 0 unless defined $self->{data}{recs}[ $self->{data}{rptr} ];
 125     my $record = $self->{data}{recs}[ $self->{data}{rptr} ];
 126     $self->{data}{crec} = { egid => undef, bib  => undef, tags => undef };
 127
 128     my @fields = $record->children;
 129     for my $f (@fields)
 130       { $self->process_field($f) }
 131
 132     # cleanup memory and increment pointer
 133     $record->purge;
 134     $self->{data}{rptr}++;
 135
 136     # FIXME check for required fields here
 137
 138     return $self->{data}{crec};
 139 }
 140
 141 sub process_field {
 142     my ($self, $field) = @_;
 143     my $map = $self->{map};
 144     my $tag = $field->{'att'}->{'tag'};
 145     my $crec = $self->{data}{crec};
 146
 147     # leader
 148     unless (defined $tag) {
 149         #FIXME
 150         return;
 151     }
 152
 153     # datafields
 154     if ($tag == 903) {
 155         my $sub = $field->first_child('subfield');
 156         $crec->{egid} = $sub->text;
 157         return;
 158     }
 159     if ($map->has($tag)) {
 160         push @{$crec->{tags}}, { tag => $tag, uni => undef, multi => undef };
 161         my @subs = $field->children('subfield');
 162         for my $sub (@subs)
 163           { $self->process_subs($tag, $sub) }
 164         # check map to ensure all declared subs have a value
 165         my $mods = $map->mods($field);
 166         for my $mappedsub ( @{ $map->subfields($tag) } ) {
 167             next if $mods->{multi};
 168             $crec->{tags}[-1]{uni}{$mappedsub} = ''
 169               unless defined $crec->{tags}[-1]{uni}{$mappedsub};
 170         }
 171     }
 172 }
 173
 174 sub process_subs {
 175     my ($self, $tag, $sub) = @_;
 176     my $map  = $self->{map};
 177     my $code = $sub->{'att'}->{'code'};
 178
 179     # handle unmapped tag/subs
 180     unless ($map->has($tag, $code)) {
 181         my $u = $self->{data}{umap};
 182         my $s = $self->{data}{stag};
 183         return unless (defined $s->{$tag});
 184
 185         # set a value, total-seen count and records-seen-in count
 186         $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code};
 187         $u->{$tag}{$code}{count}++;
 188         $u->{$tag}{$code}{rcnt}++ unless ( defined $u->{$tag}{$code}{last} and
 189                                            $u->{$tag}{$code}{last} == $self->{data}{rptr} );
 190         $u->{$tag}{$code}{last} = $self->{data}{rptr};
 191         return;
 192     }
 193
 194     # fetch our datafield struct and fieldname
 195     my $dataf = $self->{data}{crec}{tags}[-1];
 196     my $field = $map->field($tag, $code);
 197
 198     # handle modifiers, or slug data in normally
 199     if (my $mods = $map->mods($field)) {
 200         if ($mods->{multi}) {
 201             my $name = $tag . $code;
 202             push @{$dataf->{multi}{$name}}, $sub->text;
 203         }
 204     } else {
 205         die "Multiple occurances of a non-multi field: $tag$code at rec ",($self->{data}{rptr} + 1),"\n"
 206           if (defined $dataf->{uni}{$code});
 207         $dataf->{uni}{$code} = $sub->text;
 208     }
 209 }
 210
 211 =head1 MODIFIERS
 212
 213 MapDrivenMARCXMLProc implements the following modifiers, and passes
 214 them to L<Equinox::Migration::SubfieldMapper>, meaning that specifying
 215 any other modifiers in a MDMP map file will cause a fatal error when
 216 it is processed.
 217
 218 =head2 multi
 219
 220 If a mapping is declared to be C<multi>, then MDMP expects to see more
 221 than one instance of that subfield per datafield, and the data is
 222 handled accordingly (see L</PARSED RECORDS> below).
 223
 224 Occurring zero or one time is legal for a C<multi> mapping.
 225
 226 A mapping which is not flagged as C<multi>, but which occurs more than
 227 once per datafield will cause a fatal error.
 228
 229 =head2 bib
 230
 231 The C<bib> modifier declares that a mapping is "bib-level", and should
 232 be encountered once per B<record> instead of once per B<datafield> --
 233 which is another way of saying that it occurs in a non-repeating
 234 datafield or in a controlfield.
 235
 236 =head2 required
 237
 238 By default, if a mapping does not occur in a datafield (or record, in
 239 the case of C<bib> mappings), processing continues normally. if a
 240 mapping has the C<required> modifier, however, it must appear, or a
 241 fatal error will occur.
 242
 243 =head1 PARSED RECORDS
 244
 245 Given:
 246
 247     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new(ARGUMENTS);
 248     $rec = $m->parse_record;
 249
 250 Then C<$rec> will look like:
 251
 252     {
 253       egid   => evergreen_record_id,
 254       bib    => {
 255                   (tag_id . sub_code)1 => value1,
 256                   (tag_id . sub_code)2 => value2,
 257                   ...
 258                 },
 259       tags => [
 260                 {
 261                   tag   => tag_id,
 262                   multi => { (tag_id . sub_code) => [ val1, val2, ... ] },
 263                   uni   => { code => value, code2 => value2, ... },
 264                 },
 265                 ...
 266               ]
 267     }
 268
 269 That is, there is an C<egid> key which points to the Evergreen ID of
 270 that record, a C<bib> key which points to a hashref, and a C<tags>
 271 key which points to an arrayref.
 272
 273 =head3 C<bib>
 274
 275 A reference to a hash which holds extracted data which occurs only
 276 once per record (and is therefore "bib-level"; the default assumption
 277 is that a tag/subfield pair can occur multiple times per record). The
 278 keys are composed of tag id and subfield code, catenated
 279 (e.g. 901c). The values are the contents of that subfield of that tag.
 280
 281 If there are no tags defined as bib-level in the mapfile, C<bib> will
 282 be C<undef>.
 283
 284 =head3 C<tags>
 285
 286 A reference to a list of anonymous hashes, one for each instance of
 287 each tag which occurs in the map.
 288
 289 Each tag hash holds its own id (e.g. C<998>), and two references to
 290 two more hashrefs, C<multi> and C<uni>.
 291
 292 The C<multi> hash holds the extracted data for tag/sub mappings which
 293 have the C<multiple> modifier on them. The keys in C<multi> are
 294 composed of the tag id and subfield code, catenated
 295 (e.g. C<901c>). The values are arrayrefs containing the content of all
 296 instances of that subfield in that instance of that tag. If no tags
 297 are defined as C<multi>, it will be C<undef>.
 298
 299 The C<uni> hash holds data for tag/sub mappings which occur only once
 300 per instance of a tag (but may occur multiple times in a record due to
 301 there being multiple instances of that tag in a record). Keys are
 302 subfield codes and values are subfield content.
 303
 304 All C<uni> subfields occuring in the map are guaranteed to be
 305 defined. Sufields which are mapped but do not occur in a particular
 306 datafield will be given a value of '' (the null string) in the current
 307 record struct. Oppose subfields which are not mapped, which will be
 308 C<undef>.
 309
 310 =head1 UNMAPPED TAGS
 311
 312 If the C<sample> argument is passed to L</new>, there will also be a
 313 structure which holds data about unmapped subfields encountered in
 314 mapped tags which are also in the declared sample set. This
 315 information is collected over the life of the object and is not reset
 316 for every record processed (as the current record data neccessarily
 317 is).
 318
 319     { tag_id => {
 320                   sub_code  => { value => VALUE,
 321                                  count => COUNT,
 322                                  rcnt => RCOUNT
 323                                },
 324                   ...
 325                 },
 326       ...
 327     }
 328
 329 For each mapped tag, for each unmapped subfield, there is a hash of
 330 data about that subfield containing
 331
 332     * value - A sample of the subfield text
 333     * count - Total number of times the subfield was seen
 334     * rcnt  - The number of records the subfield was seen in
 335
 336 =head1 AUTHOR
 337
 338 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
 339
 340 =head1 BUGS
 341
 342 Please report any bugs or feature requests to the above email address.
 343
 344 =head1 SUPPORT
 345
 346 You can find documentation for this module with the perldoc command.
 347
 348     perldoc Equinox::Migration::MapDrivenMARCXMLProc
 349
 350
 351 =head1 COPYRIGHT & LICENSE
 352
 353 Copyright 2009 Equinox, all rights reserved.
 354
 355 This program is free software; you can redistribute it and/or modify it
 356 under the same terms as Perl itself.
 357
 358
 359 =cut
 360
 361 1; # End of Equinox::Migration::MapDrivenMARCXMLProc