Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm

   1 package Equinox::Migration::MapDrivenMARCXMLProc;
   2
   3 use warnings;
   4 use strict;
   5
   6 use XML::Twig;
   7 use Equinox::Migration::SubfieldMapper;
   8
   9 =head1 NAME
  10
  11 Equinox::Migration::MapDrivenMARCXMLProc
  12
  13 =head1 VERSION
  14
  15 Version 1.000
  16
  17 =cut
  18
  19 our $VERSION = '1.000';
  20
  21
  22 =head1 SYNOPSIS
  23
  24 Foo
  25
  26     use Equinox::Migration::MapDrivenMARCXMLProc;
  27
  28
  29 =head1 METHODS
  30
  31
  32 =head2 new
  33
  34 Takes two required arguments: C<mapfile> (which will be passed along
  35 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
  36 and C<marcfile> (the MARC data to be processed).
  37
  38     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile  => FILE,
  39                                                            marcfile => FILE );
  40
  41 There is an optional third, argument, C<sample>, which specifies a
  42 arrayref of datafields to "sample" by reporting on subfields which are
  43 found in the data but not in the map.
  44
  45     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile  => FILE,
  46                                                            marcfile => FILE,
  47                                                            sample   => \@TAGS
  48                                                          );
  49
  50 See L</UNMAPPED TAGS> for more info.
  51
  52 =cut
  53
  54 sub new {
  55     my ($class, %args) = @_;
  56
  57     my $self = bless { mods => { multi    => {},
  58                                  once     => {},
  59                                  required => {},
  60                                },
  61                        data => { recs => undef, # X::T record objects
  62                                  rptr => 0,     # next record pointer
  63                                  crec => undef, # parsed record storage
  64                                  stag => undef, # list of tags to sample
  65                                  umap => undef, # unmapped data samples
  66                                },
  67                      }, $class;
  68
  69     # initialize map and taglist
  70     die "Argument 'mapfile' must be specified\n" unless (defined $args{mapfile});
  71     my @mods = keys %{$self->{mods}};
  72     $self->{map} = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
  73                                                             mods => \@mods );
  74     $self->{data}{tags} = $self->{map}->tags;
  75
  76     # initialize twig
  77     die "Argument 'marcfile' must be specified\n" unless (defined $args{marcfile});
  78     if (-r $args{marcfile}) {
  79         $self->{twig} = XML::Twig->new;
  80         $self->{twig}->parsefile($args{marcfile});
  81         my @records = $self->{twig}->root->children;
  82         $self->{data}{recs} = \@records;
  83     } else {
  84         die "Can't open marc file: $!\n";
  85     }
  86
  87     return $self;
  88 }
  89
  90
  91 =head2 parse_record
  92
  93 Extracts data from the next record, per the mapping file. Returns a
  94 normalized datastructure (see L</format_record> for details) on
  95 success; returns 0 otherwise.
  96
  97     while (my $rec = $m->parse_record) {
  98       # handle extracted record data
  99     }
 100
 101 =cut
 102
 103 sub parse_record {
 104     my ($self) = @_;
 105
 106     # get the next record and wipe current parsed record
 107     return 0 unless defined $self->{data}{recs}[ $self->{data}{rptr} ];
 108     my $record = $self->{data}{recs}[ $self->{data}{rptr} ];
 109     $self->{data}{crec} = { bib => undef, multi => undef };
 110
 111     my @fields = $record->children;
 112     for my $f (@fields)
 113       { $self->process_field($f) }
 114
 115     # cleanup memory and increment pointer
 116     $record->purge;
 117     $self->{data}{rptr}++;
 118
 119     return $self->format_record;
 120 }
 121
 122 =head2 process_field
 123
 124 =cut
 125
 126 sub process_field {
 127     my ($self, $field) = @_;
 128     my $map = $self->{map};
 129     my $tag = $field->{'att'}->{'tag'};
 130     my $parsed = $self->{data}{crec};
 131
 132     # datafields
 133     if (defined $tag) {
 134         if ($tag == 903) {
 135             my $sub = $field->first_child('subfield');
 136             $parsed->{egid} = $sub->text;;
 137         } elsif ($map->has($tag)) {
 138             push @{$parsed->{tags}}, { tag => $tag };
 139             my @subs = $field->children('subfield');
 140             for my $sub (@subs)
 141               { $self->process_subs($tag, $sub) }
 142         }
 143     }
 144 }
 145
 146 =head2 process_subs
 147
 148 =cut
 149
 150 sub process_subs {
 151     my ($self, $tag, $sub) = @_;
 152     my $map  = $self->{map};
 153     my $code = $sub->{'att'}->{'code'};
 154
 155     # handle unmapped tag/subs
 156     unless ($map->has($tag, $code)) {
 157         my $u = $self->{data}{umap};
 158         my $s = $self->{data}{stag};
 159         return unless (defined $s->{$tag});
 160
 161         $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code};
 162         $u->{$tag}{$code}{count}++;
 163         return;
 164     }
 165
 166     my $data = $self->{data}{crec}{tags}[-1];
 167     my $field = $map->field($tag, $code);
 168
 169     # handle modifiers
 170     if (defined $map->mods($field)) {
 171         if ($map->mods($field) eq 'multi') {
 172             my $name = $tag . $code;
 173             push @{$data->{multi}{$name}}, $sub->text;
 174         }
 175     }
 176
 177     $data->{uni}{$code} = $sub->text;
 178 }
 179
 180 =head1 PARSED RECORDS
 181
 182     {
 183       egid   => evergreen_record_id,
 184       bib    => {
 185                   (tag_id . sub_code)1 => value1,
 186                   (tag_id . sub_code)2 => value2,
 187                   ...
 188                 },
 189       tags => [
 190                 {
 191                   tag   => tag_id,
 192                   multi => { (tag_id . sub_code) => [ val1, val2, ... ] },
 193                   uni   => { code => value, code2 => value2, ... },
 194                 },
 195                 ...
 196               ]
 197     }
 198
 199 That is, there is an C<egid> key which points to the Evergreen ID of
 200 that record, a C<bib> key which points to a hashref, and a C<tags>
 201 key which points to an arrayref.
 202
 203 =head3 C<bib>
 204
 205 This hashref holds extracted data which should occur once per record
 206 (the default assumption is that a tag/subfield pair can occur multiple
 207 times per record). The keys are composed of tag id and subfield code,
 208 catenated (e.g. 901c). The values are the contents of that subfield of
 209 that tag.
 210
 211 If there are no tags defined as bib-level, C<bib> will be C<undef>.
 212
 213 =head3 C<tags>
 214
 215 This arrayref holds anonymous hashrefs, one for each instance of each
 216 tag which occurs in the map. Each tag hashref holds its own id
 217 (e.g. C<998>), and two more hashrefs, C<multi> and C<uni>.
 218
 219 The C<multi> hashref holds the extracted data for tag/sub mappings
 220 which have the C<multiple> modifier on them. The keys in C<multi> are
 221 composed of the tag id and subfield code, catenated
 222 (e.g. C<901c>). The values are arrayrefs containing the content of all
 223 instances of that subfield in that instance of that tag.
 224
 225 The C<uni> hashref holds data for tag/sub mappings which occur only
 226 once per instance of a tag (but may occur multiple times in a record
 227 due to there being multiple instances of that tag in a record). Keys
 228 are subfield codes and values are subfield content.
 229
 230 If no tags are defined as C<multi>, it will be C<undef>.
 231
 232 =head1 UNMAPPED TAGS
 233
 234     { tag_id => {
 235                   sub_code  => { value => VALUE, count => COUNT },
 236                   sub_code2 => { value => VALUE, count => COUNT },
 237                   ...
 238                 },
 239       ...
 240     }
 241
 242 =head1 AUTHOR
 243
 244 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
 245
 246 =head1 BUGS
 247
 248 Please report any bugs or feature requests to the above email address.
 249
 250 =head1 SUPPORT
 251
 252 You can find documentation for this module with the perldoc command.
 253
 254     perldoc Equinox::Migration::MapDrivenMARCXMLProc
 255
 256
 257 =head1 COPYRIGHT & LICENSE
 258
 259 Copyright 2009 Equinox, all rights reserved.
 260
 261 This program is free software; you can redistribute it and/or modify it
 262 under the same terms as Perl itself.
 263
 264
 265 =cut
 266
 267 1; # End of Equinox::Migration::MapDrivenMARCXMLProc