Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm

   1 package Equinox::Migration::MapDrivenMARCXMLProc;
   2
   3 use warnings;
   4 use strict;
   5
   6 use XML::Twig;
   7 use Equinox::Migration::SubfieldMapper 1.002;
   8
   9 =head1 NAME
  10
  11 Equinox::Migration::MapDrivenMARCXMLProc
  12
  13 =head1 VERSION
  14
  15 Version 1.000
  16
  17 =cut
  18
  19 our $VERSION = '1.000';
  20
  21
  22 =head1 SYNOPSIS
  23
  24 Foo
  25
  26     use Equinox::Migration::MapDrivenMARCXMLProc;
  27
  28
  29 =head1 METHODS
  30
  31
  32 =head2 new
  33
  34 Takes two required arguments: C<mapfile> (which will be passed along
  35 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
  36 and C<marcfile> (the MARC data to be processed).
  37
  38     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile  => FILE,
  39                                                            marcfile => FILE );
  40
  41 There is an optional third, argument, C<sample>, which specifies a
  42 arrayref of datafields to "sample" by reporting on subfields which are
  43 found in the data but not in the map.
  44
  45     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile  => FILE,
  46                                                            marcfile => FILE,
  47                                                            sample   => \@TAGS
  48                                                          );
  49
  50 See L</UNMAPPED TAGS> for more info.
  51
  52 =cut
  53
  54 sub new {
  55     my ($class, %args) = @_;
  56
  57     my $self = bless { mods => { multi    => {},
  58                                  once     => {},
  59                                  required => {},
  60                                },
  61                        data => { recs => undef, # X::T record objects
  62                                  rptr => 0,     # next record pointer
  63                                  crec => undef, # parsed record storage
  64                                  stag => undef, # list of tags to sample
  65                                  umap => undef, # unmapped data samples
  66                                },
  67                      }, $class;
  68
  69     # initialize map and taglist
  70     die "Argument 'mapfile' must be specified\n" unless (defined $args{mapfile});
  71     my @mods = keys %{$self->{mods}};
  72     $self->{map} = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
  73                                                             mods => \@mods );
  74     $self->{data}{tags} = $self->{map}->tags;
  75
  76     # initialize twig
  77     die "Argument 'marcfile' must be specified\n" unless (defined $args{marcfile});
  78     if (-r $args{marcfile}) {
  79         $self->{twig} = XML::Twig->new;
  80         $self->{twig}->parsefile($args{marcfile});
  81         my @records = $self->{twig}->root->children;
  82         $self->{data}{recs} = \@records;
  83     } else {
  84         die "Can't open marc file: $!\n";
  85     }
  86
  87     # if we have a sample arg, set up the sample set and umap hash
  88     if (defined $args{sample}) {
  89         for my $s ( @{$args{sample}})
  90           { $self->{data}{stag}{$s} = 1 }
  91         $self->{data}{umap} = {};
  92     }
  93
  94     return $self;
  95 }
  96
  97
  98 =head2 parse_record
  99
 100 Extracts data from the next record, per the mapping file. Returns a
 101 normalized datastructure (see L</format_record> for details) on
 102 success; returns 0 otherwise.
 103
 104     while (my $rec = $m->parse_record) {
 105       # handle extracted record data
 106     }
 107
 108 =cut
 109
 110 sub parse_record {
 111     my ($self) = @_;
 112
 113     # get the next record and wipe current parsed record
 114     return 0 unless defined $self->{data}{recs}[ $self->{data}{rptr} ];
 115     my $record = $self->{data}{recs}[ $self->{data}{rptr} ];
 116     $self->{data}{crec} = { egid => undef, bib  => undef, tags => undef };
 117
 118     my @fields = $record->children;
 119     for my $f (@fields)
 120       { $self->process_field($f) }
 121
 122     # cleanup memory and increment pointer
 123     $record->purge;
 124     $self->{data}{rptr}++;
 125
 126     return $self->{data}{crec};
 127 }
 128
 129 sub process_field {
 130     my ($self, $field) = @_;
 131     my $map = $self->{map};
 132     my $tag = $field->{'att'}->{'tag'};
 133     my $crec = $self->{data}{crec};
 134
 135     # leader
 136     unless (defined $tag) {
 137         #FIXME
 138         return;
 139     }
 140
 141     # datafields
 142     if ($tag == 903) {
 143         my $sub = $field->first_child('subfield');
 144         $crec->{egid} = $sub->text;
 145         return;
 146     }
 147     if ($map->has($tag)) {
 148         push @{$crec->{tags}}, { tag => $tag, uni => undef, multi => undef };
 149         my @subs = $field->children('subfield');
 150         for my $sub (@subs)
 151           { $self->process_subs($tag, $sub) }
 152         # check map to ensure all declared subs have a value
 153         my $mods = $map->mods($field);
 154         for my $mappedsub ( @{ $map->subfields($tag) } ) {
 155             next if $mods->{multi};
 156             $crec->{tags}[-1]{uni}{$mappedsub} = ''
 157               unless defined $crec->{tags}[-1]{uni}{$mappedsub};
 158         }
 159     }
 160 }
 161
 162 sub process_subs {
 163     my ($self, $tag, $sub) = @_;
 164     my $map  = $self->{map};
 165     my $code = $sub->{'att'}->{'code'};
 166
 167     # handle unmapped tag/subs
 168     unless ($map->has($tag, $code)) {
 169         my $u = $self->{data}{umap};
 170         my $s = $self->{data}{stag};
 171         return unless (defined $s->{$tag});
 172
 173         # set a value, total-seen count and records-seen-in count
 174         $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code};
 175         $u->{$tag}{$code}{count}++;
 176         $u->{$tag}{$code}{rcnt}++ unless ( defined $u->{$tag}{$code}{last} and
 177                                            $u->{$tag}{$code}{last} == $self->{data}{rptr} );
 178         $u->{$tag}{$code}{last} = $self->{data}{rptr};
 179         return;
 180     }
 181
 182     # fetch our datafield struct and fieldname
 183     my $dataf = $self->{data}{crec}{tags}[-1];
 184     my $field = $map->field($tag, $code);
 185
 186     # handle modifiers, or slug data in normally
 187     if (my $mods = $map->mods($field)) {
 188         if ($mods->{multi}) {
 189             my $name = $tag . $code;
 190             push @{$dataf->{multi}{$name}}, $sub->text;
 191         }
 192     } else {
 193         $dataf->{uni}{$code} = $sub->text;
 194     }
 195 }
 196
 197 =head1 PARSED RECORDS
 198
 199 Given:
 200
 201     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new(ARGUMENTS);
 202     $rec = $m->parse_record;
 203
 204 Then C<$rec> will look like:
 205
 206     {
 207       egid   => evergreen_record_id,
 208       bib    => {
 209                   (tag_id . sub_code)1 => value1,
 210                   (tag_id . sub_code)2 => value2,
 211                   ...
 212                 },
 213       tags => [
 214                 {
 215                   tag   => tag_id,
 216                   multi => { (tag_id . sub_code) => [ val1, val2, ... ] },
 217                   uni   => { code => value, code2 => value2, ... },
 218                 },
 219                 ...
 220               ]
 221     }
 222
 223 That is, there is an C<egid> key which points to the Evergreen ID of
 224 that record, a C<bib> key which points to a hashref, and a C<tags>
 225 key which points to an arrayref.
 226
 227 =head3 C<bib>
 228
 229 A reference to a hash which holds extracted data which occurs only
 230 once per record (and is therefore "bib-level"; the default assumption
 231 is that a tag/subfield pair can occur multiple times per record). The
 232 keys are composed of tag id and subfield code, catenated
 233 (e.g. 901c). The values are the contents of that subfield of that tag.
 234
 235 If there are no tags defined as bib-level in the mapfile, C<bib> will
 236 be C<undef>.
 237
 238 =head3 C<tags>
 239
 240 A reference to a list of anonymous hashes, one for each instance of
 241 each tag which occurs in the map.
 242
 243 Each tag hash holds its own id (e.g. C<998>), and two references to
 244 two more hashrefs, C<multi> and C<uni>.
 245
 246 The C<multi> hash holds the extracted data for tag/sub mappings which
 247 have the C<multiple> modifier on them. The keys in C<multi> are
 248 composed of the tag id and subfield code, catenated
 249 (e.g. C<901c>). The values are arrayrefs containing the content of all
 250 instances of that subfield in that instance of that tag. If no tags
 251 are defined as C<multi>, it will be C<undef>.
 252
 253 The C<uni> hash holds data for tag/sub mappings which occur only once
 254 per instance of a tag (but may occur multiple times in a record due to
 255 there being multiple instances of that tag in a record). Keys are
 256 subfield codes and values are subfield content.
 257
 258 All C<uni> subfields occuring in the map are guaranteed to be
 259 defined. Sufields which are mapped but do not occur in a particular
 260 datafield will be given a value of '' (the null string) in the current
 261 record struct. Oppose subfields which are not mapped, which will be
 262 C<undef>.
 263
 264 =head1 UNMAPPED TAGS
 265
 266 If the C<sample> argument is passed to L</new>, there will also be a
 267 structure which holds data about unmapped subfields encountered in
 268 mapped tags which are also in the declared sample set. This
 269 information is collected over the life of the object and is not reset
 270 for every record processed (as the current record data neccessarily
 271 is).
 272
 273     { tag_id => {
 274                   sub_code  => { value => VALUE,
 275                                  count => COUNT,
 276                                  rcnt => RCOUNT
 277                                },
 278                   ...
 279                 },
 280       ...
 281     }
 282
 283 For each mapped tag, for each unmapped subfield, there is a hash of
 284 data about that subfield containing
 285
 286     * value - A sample of the subfield text
 287     * count - Total number of times the subfield was seen
 288     * rcnt  - The number of records the subfield was seen in
 289
 290 =head1 AUTHOR
 291
 292 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
 293
 294 =head1 BUGS
 295
 296 Please report any bugs or feature requests to the above email address.
 297
 298 =head1 SUPPORT
 299
 300 You can find documentation for this module with the perldoc command.
 301
 302     perldoc Equinox::Migration::MapDrivenMARCXMLProc
 303
 304
 305 =head1 COPYRIGHT & LICENSE
 306
 307 Copyright 2009 Equinox, all rights reserved.
 308
 309 This program is free software; you can redistribute it and/or modify it
 310 under the same terms as Perl itself.
 311
 312
 313 =cut
 314
 315 1; # End of Equinox::Migration::MapDrivenMARCXMLProc