Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm

   1 package Equinox::Migration::MapDrivenMARCXMLProc;
   2
   3 use warnings;
   4 use strict;
   5
   6 use XML::Twig;
   7 use Equinox::Migration::SubfieldMapper 1.003;
   8
   9 # FIXME
  10 #
  11 # sample functionality should be extracted into a new module which
  12 # uses E::M::SM to drive sampling of individual datafields, and
  13 # reports ALL datafields which occur
  14 #
  15 # --sample should give the list of all datafields
  16 # --samplefile should take a SM map as teh argument and introspect the mapped datafields
  17
  18
  19 =head1 NAME
  20
  21 Equinox::Migration::MapDrivenMARCXMLProc
  22
  23 =head1 VERSION
  24
  25 Version 1.000
  26
  27 =cut
  28
  29 our $VERSION = '1.000';
  30
  31
  32 =head1 SYNOPSIS
  33
  34 Foo
  35
  36     use Equinox::Migration::MapDrivenMARCXMLProc;
  37
  38
  39 =head1 METHODS
  40
  41
  42 =head2 new
  43
  44 Takes two required arguments: C<mapfile> (which will be passed along
  45 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
  46 and C<marcfile> (the MARC data to be processed).
  47
  48     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile  => FILE,
  49                                                            marcfile => FILE );
  50
  51 =cut
  52
  53 sub new {
  54     my ($class, %args) = @_;
  55
  56     my $self = bless { mods => { multi    => {},
  57                                  bib      => {},
  58                                  required => {},
  59                                },
  60                        data => { recs => undef, # X::T record objects
  61                                  rptr => 0,     # next record pointer
  62                                  crec => undef, # parsed record storage
  63                                },
  64                      }, $class;
  65
  66     # initialize map and taglist
  67     die "Argument 'mapfile' must be specified\n" unless (defined $args{mapfile});
  68     my @mods = keys %{$self->{mods}};
  69     $self->{map} = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
  70                                                             mods => \@mods );
  71     $self->{data}{tags} = $self->{map}->tags;
  72
  73     # initialize twig
  74     die "Argument 'marcfile' must be specified\n" unless (defined $args{marcfile});
  75     if (-r $args{marcfile}) {
  76         $self->{twig} = XML::Twig->new;
  77         $self->{twig}->parsefile($args{marcfile});
  78         my @records = $self->{twig}->root->children;
  79         $self->{data}{recs} = \@records;
  80     } else {
  81         die "Can't open marc file: $!\n";
  82     }
  83
  84     return $self;
  85 }
  86
  87
  88 =head2 parse_record
  89
  90 Extracts data from the next record, per the mapping file. Returns a
  91 normalized datastructure (see L</format_record> for details) on
  92 success; returns 0 otherwise.
  93
  94     while (my $rec = $m->parse_record) {
  95       # handle extracted record data
  96     }
  97
  98 =cut
  99
 100 sub parse_record {
 101     my ($self) = @_;
 102
 103     # get the next record and wipe current parsed record
 104     return 0 unless defined $self->{data}{recs}[ $self->{data}{rptr} ];
 105     my $record = $self->{data}{recs}[ $self->{data}{rptr} ];
 106     $self->{data}{crec} = { egid => undef, bib  => undef, tags => undef };
 107
 108     my @fields = $record->children;
 109     for my $f (@fields)
 110       { $self->process_field($f) }
 111
 112     # cleanup memory and increment pointer
 113     $record->purge;
 114     $self->{data}{rptr}++;
 115
 116     # check for required fields
 117     $self->check_required;
 118
 119     return $self->{data}{crec};
 120 }
 121
 122 sub process_field {
 123     my ($self, $field) = @_;
 124     my $map = $self->{map};
 125     my $tag = $field->{'att'}->{'tag'};
 126     my $crec = $self->{data}{crec};
 127
 128     # leader
 129     unless (defined $tag) {
 130         #FIXME
 131         return;
 132     }
 133
 134     # datafields
 135     if ($tag == 903) {
 136         my $sub = $field->first_child('subfield');
 137         $crec->{egid} = $sub->text;
 138         return;
 139     }
 140     if ($map->has($tag)) {
 141         push @{$crec->{tags}}, { tag => $tag, uni => undef, multi => undef };
 142         my @subs = $field->children('subfield');
 143         for my $sub (@subs)
 144           { $self->process_subs($tag, $sub) }
 145         # check map to ensure all declared subs have a value
 146         my $mods = $map->mods($field);
 147         for my $mappedsub ( @{ $map->subfields($tag) } ) {
 148             next if $mods->{multi};
 149             $crec->{tags}[-1]{uni}{$mappedsub} = ''
 150               unless defined $crec->{tags}[-1]{uni}{$mappedsub};
 151         }
 152     }
 153 }
 154
 155 sub process_subs {
 156     my ($self, $tag, $sub) = @_;
 157     my $map  = $self->{map};
 158     my $code = $sub->{'att'}->{'code'};
 159
 160     # handle unmapped tag/subs
 161     return unless ($map->has($tag, $code));
 162
 163     # fetch our datafield struct and fieldname
 164     my $dataf = $self->{data}{crec}{tags}[-1];
 165     my $field = $map->field($tag, $code);
 166
 167     # handle modifiers
 168     if (my $mods = $map->mods($field)) {
 169         if ($mods->{multi}) {
 170             my $name = $tag . $code;
 171             push @{$dataf->{multi}{$name}}, $sub->text;
 172             return;
 173         }
 174     }
 175
 176     die "Multiple occurances of a non-multi field: $tag$code at rec ",
 177       ($self->{data}{rptr} + 1),"\n" if (defined $dataf->{uni}{$code});
 178     $dataf->{uni}{$code} = $sub->text;
 179 }
 180
 181
 182 sub check_required {
 183     my ($self) = @_;
 184     my $mods = $self->{map}->mods;
 185     my $crec = $self->{data}{crec};
 186
 187     for my $tag_id (keys %{$mods->{required}}) {
 188         for my $code (@{$mods->{required}{$tag_id}}) {
 189             my $found = 0;
 190
 191             $found = 1 if ($crec->{bib}{($tag_id . $code)});
 192             for my $tag (@{$crec->{tags}}) {
 193                 $found = 1 if ($tag->{multi}{($tag_id . $code)});
 194                 $found = 1 if ($tag->{uni}{$code});
 195             }
 196
 197             die "Required mapping $tag_id$code not found in rec ",$self->{data}{rptr},"\n"
 198               unless ($found);
 199         }
 200     }
 201
 202 }
 203
 204 =head1 MODIFIERS
 205
 206 MapDrivenMARCXMLProc implements the following modifiers, and passes
 207 them to L<Equinox::Migration::SubfieldMapper>, meaning that specifying
 208 any other modifiers in a MDMP map file will cause a fatal error when
 209 it is processed.
 210
 211 =head2 multi
 212
 213 If a mapping is declared to be C<multi>, then MDMP expects to see more
 214 than one instance of that subfield per datafield, and the data is
 215 handled accordingly (see L</PARSED RECORDS> below).
 216
 217 Occurring zero or one time is legal for a C<multi> mapping.
 218
 219 A mapping which is not flagged as C<multi>, but which occurs more than
 220 once per datafield will cause a fatal error.
 221
 222 =head2 bib
 223
 224 The C<bib> modifier declares that a mapping is "bib-level", and should
 225 be encountered once per B<record> instead of once per B<datafield> --
 226 which is another way of saying that it occurs in a non-repeating
 227 datafield or in a controlfield.
 228
 229 =head2 required
 230
 231 By default, if a mapping does not occur in a datafield (or record, in
 232 the case of C<bib> mappings), processing continues normally. if a
 233 mapping has the C<required> modifier, however, it must appear, or a
 234 fatal error will occur.
 235
 236 =head1 PARSED RECORDS
 237
 238 Given:
 239
 240     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new(ARGUMENTS);
 241     $rec = $m->parse_record;
 242
 243 Then C<$rec> will look like:
 244
 245     {
 246       egid   => evergreen_record_id,
 247       bib    => {
 248                   (tag_id . sub_code)1 => value1,
 249                   (tag_id . sub_code)2 => value2,
 250                   ...
 251                 },
 252       tags => [
 253                 {
 254                   tag   => tag_id,
 255                   multi => { (tag_id . sub_code) => [ val1, val2, ... ] },
 256                   uni   => { code => value, code2 => value2, ... },
 257                 },
 258                 ...
 259               ]
 260     }
 261
 262 That is, there is an C<egid> key which points to the Evergreen ID of
 263 that record, a C<bib> key which points to a hashref, and a C<tags>
 264 key which points to an arrayref.
 265
 266 =head3 C<bib>
 267
 268 A reference to a hash which holds extracted data which occurs only
 269 once per record (and is therefore "bib-level"; the default assumption
 270 is that a tag/subfield pair can occur multiple times per record). The
 271 keys are composed of tag id and subfield code, catenated
 272 (e.g. 901c). The values are the contents of that subfield of that tag.
 273
 274 If there are no tags defined as bib-level in the mapfile, C<bib> will
 275 be C<undef>.
 276
 277 =head3 C<tags>
 278
 279 A reference to a list of anonymous hashes, one for each instance of
 280 each tag which occurs in the map.
 281
 282 Each tag hash holds its own id (e.g. C<998>), and two references to
 283 two more hashrefs, C<multi> and C<uni>.
 284
 285 The C<multi> hash holds the extracted data for tag/sub mappings which
 286 have the C<multiple> modifier on them. The keys in C<multi> are
 287 composed of the tag id and subfield code, catenated
 288 (e.g. C<901c>). The values are arrayrefs containing the content of all
 289 instances of that subfield in that instance of that tag. If no tags
 290 are defined as C<multi>, it will be C<undef>.
 291
 292 The C<uni> hash holds data for tag/sub mappings which occur only once
 293 per instance of a tag (but may occur multiple times in a record due to
 294 there being multiple instances of that tag in a record). Keys are
 295 subfield codes and values are subfield content.
 296
 297 All C<uni> subfields occuring in the map are guaranteed to be
 298 defined. Sufields which are mapped but do not occur in a particular
 299 datafield will be given a value of '' (the null string) in the current
 300 record struct. Oppose subfields which are not mapped, which will be
 301 C<undef>.
 302
 303
 304 =head1 AUTHOR
 305
 306 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
 307
 308 =head1 BUGS
 309
 310 Please report any bugs or feature requests to the above email address.
 311
 312 =head1 SUPPORT
 313
 314 You can find documentation for this module with the perldoc command.
 315
 316     perldoc Equinox::Migration::MapDrivenMARCXMLProc
 317
 318
 319 =head1 COPYRIGHT & LICENSE
 320
 321 Copyright 2009 Equinox, all rights reserved.
 322
 323 This program is free software; you can redistribute it and/or modify it
 324 under the same terms as Perl itself.
 325
 326
 327 =cut
 328
 329 1; # End of Equinox::Migration::MapDrivenMARCXMLProc