Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm

   1 package Equinox::Migration::MapDrivenMARCXMLProc;
   2
   3 use warnings;
   4 use strict;
   5
   6 use XML::Twig;
   7 use Equinox::Migration::SubfieldMapper 1.002;
   8
   9 =head1 NAME
  10
  11 Equinox::Migration::MapDrivenMARCXMLProc
  12
  13 =head1 VERSION
  14
  15 Version 1.000
  16
  17 =cut
  18
  19 our $VERSION = '1.000';
  20
  21
  22 =head1 SYNOPSIS
  23
  24 Foo
  25
  26     use Equinox::Migration::MapDrivenMARCXMLProc;
  27
  28
  29 =head1 METHODS
  30
  31
  32 =head2 new
  33
  34 Takes two required arguments: C<mapfile> (which will be passed along
  35 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
  36 and C<marcfile> (the MARC data to be processed).
  37
  38     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile  => FILE,
  39                                                            marcfile => FILE );
  40
  41 There is an optional third, argument, C<sample>, which specifies a
  42 arrayref of datafields to "sample" by reporting on subfields which are
  43 found in the data but not in the map.
  44
  45     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile  => FILE,
  46                                                            marcfile => FILE,
  47                                                            sample   => \@TAGS
  48                                                          );
  49
  50 See L</UNMAPPED TAGS> for more info.
  51
  52 =cut
  53
  54 sub new {
  55     my ($class, %args) = @_;
  56
  57     my $self = bless { mods => { multi    => {},
  58                                  once     => {},
  59                                  required => {},
  60                                },
  61                        data => { recs => undef, # X::T record objects
  62                                  rptr => 0,     # next record pointer
  63                                  crec => undef, # parsed record storage
  64                                  stag => undef, # list of tags to sample
  65                                  umap => undef, # unmapped data samples
  66                                },
  67                      }, $class;
  68
  69     # initialize map and taglist
  70     die "Argument 'mapfile' must be specified\n" unless (defined $args{mapfile});
  71     my @mods = keys %{$self->{mods}};
  72     $self->{map} = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
  73                                                             mods => \@mods );
  74     $self->{data}{tags} = $self->{map}->tags;
  75
  76     # initialize twig
  77     die "Argument 'marcfile' must be specified\n" unless (defined $args{marcfile});
  78     if (-r $args{marcfile}) {
  79         $self->{twig} = XML::Twig->new;
  80         $self->{twig}->parsefile($args{marcfile});
  81         my @records = $self->{twig}->root->children;
  82         $self->{data}{recs} = \@records;
  83     } else {
  84         die "Can't open marc file: $!\n";
  85     }
  86
  87     return $self;
  88 }
  89
  90
  91 =head2 parse_record
  92
  93 Extracts data from the next record, per the mapping file. Returns a
  94 normalized datastructure (see L</format_record> for details) on
  95 success; returns 0 otherwise.
  96
  97     while (my $rec = $m->parse_record) {
  98       # handle extracted record data
  99     }
 100
 101 =cut
 102
 103 sub parse_record {
 104     my ($self) = @_;
 105
 106     # get the next record and wipe current parsed record
 107     return 0 unless defined $self->{data}{recs}[ $self->{data}{rptr} ];
 108     my $record = $self->{data}{recs}[ $self->{data}{rptr} ];
 109     $self->{data}{crec} = { egid => undef, bib  => undef, tags => undef };
 110
 111     my @fields = $record->children;
 112     for my $f (@fields)
 113       { $self->process_field($f) }
 114
 115     # cleanup memory and increment pointer
 116     $record->purge;
 117     $self->{data}{rptr}++;
 118
 119     return $self->{data}{crec};
 120 }
 121
 122 sub process_field {
 123     my ($self, $field) = @_;
 124     my $map = $self->{map};
 125     my $tag = $field->{'att'}->{'tag'};
 126     my $crec = $self->{data}{crec};
 127
 128     # leader
 129     unless (defined $tag) {
 130         #FIXME
 131         return;
 132     }
 133
 134     # datafields
 135     if ($tag == 903) {
 136         my $sub = $field->first_child('subfield');
 137         $crec->{egid} = $sub->text;
 138         return;
 139     }
 140     if ($map->has($tag)) {
 141         push @{$crec->{tags}}, { tag => $tag, uni => undef, multi => undef };
 142         my @subs = $field->children('subfield');
 143         for my $sub (@subs)
 144           { $self->process_subs($tag, $sub) }
 145         # check map to ensure all declared subs have a value
 146         my $mods = $map->mods($field);
 147         for my $mappedsub ( @{ $map->subfields($tag) } ) {
 148             next if $mods->{multi};
 149             $crec->{tags}[-1]{uni}{$mappedsub} = ''
 150               unless defined $crec->{tags}[-1]{uni}{$mappedsub};
 151         }
 152     }
 153 }
 154
 155 sub process_subs {
 156     my ($self, $tag, $sub) = @_;
 157     my $map  = $self->{map};
 158     my $code = $sub->{'att'}->{'code'};
 159
 160     # handle unmapped tag/subs
 161     unless ($map->has($tag, $code)) {
 162         my $u = $self->{data}{umap};
 163         my $s = $self->{data}{stag};
 164         return unless (defined $s->{$tag});
 165
 166         # set a value, total-seen count and records-seen-in count
 167         $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code};
 168         $u->{$tag}{$code}{count}++;
 169         $u->{$tag}{$code}{rcnt}++ unless ($u->{$tag}{$code}{last} == $self->{data}{rptr});
 170         $u->{$tag}{$code}{last} = $self->{data}{rptr};
 171         return;
 172     }
 173
 174     # fetch our datafield struct and fieldname
 175     my $dataf = $self->{data}{crec}{tags}[-1];
 176     my $field = $map->field($tag, $code);
 177
 178     # handle modifiers, or slug data in normally
 179     if (my $mods = $map->mods($field)) {
 180         if ($mods->{multi}) {
 181             my $name = $tag . $code;
 182             push @{$dataf->{multi}{$name}}, $sub->text;
 183         }
 184     } else {
 185         $dataf->{uni}{$code} = $sub->text;
 186     }
 187 }
 188
 189 =head1 PARSED RECORDS
 190
 191 Given:
 192
 193     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new(ARGUMENTS);
 194     $rec = $m->parse_record;
 195
 196 Then C<$rec> will look like:
 197
 198     {
 199       egid   => evergreen_record_id,
 200       bib    => {
 201                   (tag_id . sub_code)1 => value1,
 202                   (tag_id . sub_code)2 => value2,
 203                   ...
 204                 },
 205       tags => [
 206                 {
 207                   tag   => tag_id,
 208                   multi => { (tag_id . sub_code) => [ val1, val2, ... ] },
 209                   uni   => { code => value, code2 => value2, ... },
 210                 },
 211                 ...
 212               ]
 213     }
 214
 215 That is, there is an C<egid> key which points to the Evergreen ID of
 216 that record, a C<bib> key which points to a hashref, and a C<tags>
 217 key which points to an arrayref.
 218
 219 =head3 C<bib>
 220
 221 A reference to a hash which holds extracted data which occurs only
 222 once per record (and is therefore "bib-level"; the default assumption
 223 is that a tag/subfield pair can occur multiple times per record). The
 224 keys are composed of tag id and subfield code, catenated
 225 (e.g. 901c). The values are the contents of that subfield of that tag.
 226
 227 If there are no tags defined as bib-level in the mapfile, C<bib> will
 228 be C<undef>.
 229
 230 =head3 C<tags>
 231
 232 A reference to a list of anonymous hashes, one for each instance of
 233 each tag which occurs in the map.
 234
 235 Each tag hash holds its own id (e.g. C<998>), and two references to
 236 two more hashrefs, C<multi> and C<uni>.
 237
 238 The C<multi> hash holds the extracted data for tag/sub mappings which
 239 have the C<multiple> modifier on them. The keys in C<multi> are
 240 composed of the tag id and subfield code, catenated
 241 (e.g. C<901c>). The values are arrayrefs containing the content of all
 242 instances of that subfield in that instance of that tag. If no tags
 243 are defined as C<multi>, it will be C<undef>.
 244
 245 The C<uni> hash holds data for tag/sub mappings which occur only once
 246 per instance of a tag (but may occur multiple times in a record due to
 247 there being multiple instances of that tag in a record). Keys are
 248 subfield codes and values are subfield content.
 249
 250 All C<uni> subfields occuring in the map are guaranteed to be
 251 defined. Sufields which are mapped but do not occur in a particular
 252 datafield will be given a value of '' (the null string) in the current
 253 record struct. Oppose subfields which are not mapped, which will be
 254 C<undef>.
 255
 256 =head1 UNMAPPED TAGS
 257
 258     { tag_id => {
 259                   sub_code  => { value => VALUE, count => COUNT },
 260                   sub_code2 => { value => VALUE, count => COUNT },
 261                   ...
 262                 },
 263       ...
 264     }
 265
 266 =head1 AUTHOR
 267
 268 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
 269
 270 =head1 BUGS
 271
 272 Please report any bugs or feature requests to the above email address.
 273
 274 =head1 SUPPORT
 275
 276 You can find documentation for this module with the perldoc command.
 277
 278     perldoc Equinox::Migration::MapDrivenMARCXMLProc
 279
 280
 281 =head1 COPYRIGHT & LICENSE
 282
 283 Copyright 2009 Equinox, all rights reserved.
 284
 285 This program is free software; you can redistribute it and/or modify it
 286 under the same terms as Perl itself.
 287
 288
 289 =cut
 290
 291 1; # End of Equinox::Migration::MapDrivenMARCXMLProc