Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm

   1 package Equinox::Migration::MapDrivenMARCXMLProc;
   2
   3 use warnings;
   4 use strict;
   5
   6 use XML::Twig;
   7 use Equinox::Migration::SubfieldMapper;
   8
   9 =head1 NAME
  10
  11 Equinox::Migration::MapDrivenMARCXMLProc
  12
  13 =head1 VERSION
  14
  15 Version 1.000
  16
  17 =cut
  18
  19 our $VERSION = '1.000';
  20
  21
  22 =head1 SYNOPSIS
  23
  24 Foo
  25
  26     use Equinox::Migration::MapDrivenMARCXMLProc;
  27
  28
  29 =head1 METHODS
  30
  31
  32 =head2 new
  33
  34 Takes two required arguments: C<mapfile> (which will be passed along
  35 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
  36 and C<marcfile> (the MARC data to be processed).
  37
  38     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile  => FILE,
  39                                                            marcfile => FILE );
  40
  41 There is an optional third, argument, C<sample>, which specifies a
  42 arrayref of datafields to "sample" by reporting on subfields which are
  43 found in the data but not in the map.
  44
  45     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile  => FILE,
  46                                                            marcfile => FILE,
  47                                                            sample   => \@TAGS
  48                                                          );
  49
  50 See L</UNMAPPED TAGS> for more info.
  51
  52 =cut
  53
  54 sub new {
  55     my ($class, %args) = @_;
  56
  57     my $self = bless { mods => { multi    => {},
  58                                  once     => {},
  59                                  required => {},
  60                                },
  61                        data => { recs => undef, # X::T record objects
  62                                  rptr => 0,     # next record pointer
  63                                  crec => undef, # parsed record storage
  64                                  stag => undef, # list of tags to sample
  65                                  umap => undef, # unmapped data samples
  66                                },
  67                      }, $class;
  68
  69     # initialize map and taglist
  70     my @mods = keys %{$self->{mods}};
  71     $self->{map} = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
  72                                                             mods => \@mods );
  73     $self->{tags} = $self->{map}->tags;
  74
  75     # initialize twig
  76     die "Argument 'marcfile' must be specified\n" unless (defined $args{marcfile});
  77     if (-r $args{marcfile}) {
  78         $self->{twig} = XML::Twig->new;
  79         $self->{twig}->parsefile($args{marcfile});
  80         my @records = $self->{twig}->root->children;
  81         $self->{data}{recs} = \@records;
  82     } else {
  83         die "Can't open marc file: $!\n";
  84     }
  85
  86     return $self;
  87 }
  88
  89
  90 =head2 parse_record
  91
  92 Extracts data from the next record, per the mapping file. Returns 1 on
  93 success, 0 otherwise.
  94
  95     while ($m->parse_record) {
  96       # handle extracted record data
  97     }
  98
  99 =cut
 100
 101 sub parse_record {
 102     my ($self) = @_;
 103
 104     # get the next record and wipe current parsed record
 105     return 0 unless defined $self->{data}{recs}[ $self->{data}{rptr} ];
 106     my $record = $self->{data}{recs}[ $self->{data}{rptr} ];
 107     $self->{data}{crec} = {};
 108
 109     my @fields = $record->children;
 110     for my $f (@fields)
 111       { $self->process_field($f) }
 112
 113     # cleanup memory and increment pointer
 114     $record->purge;
 115     $self->{data}{rptr}++;
 116 }
 117
 118 =head2 process_field
 119
 120 =cut
 121
 122 sub process_field {
 123     my ($self, $field) = @_;
 124     my $map = $self->{map};
 125     my $tag = $field->{'att'}->{'tag'};
 126     my $parsed = $self->{data}{crec};
 127
 128     if ($tag == 903) {
 129         my $sub = $field->first_child('subfield');
 130         $parsed->{egid} = $sub->text;;
 131     } elsif ($map->has($tag)) {
 132         push @{$parsed->{tags}}, { tag => $tag };
 133         my @subs = $field->children('subfield');
 134         for my $sub (@subs)
 135           { $self->process_subs($tag, $sub) }
 136     }
 137 }
 138
 139 =head2 process_subs
 140
 141 =cut
 142
 143 sub process_subs {
 144     my ($self, $tag, $sub) = @_;
 145     my $map  = $self->{map};
 146     my $code = $sub->{'att'}->{'code'};
 147
 148     # handle unmapped tag/subs
 149     unless ($map->has($tag, $code)) {
 150         my $u = $self->{data}{umap};
 151         my $s = $self->{data}{stag};
 152         return unless (defined $s->{$tag});
 153
 154         $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code};
 155         $u->{$tag}{$code}{count}++;
 156         return;
 157     }
 158
 159     my $data = $self->{data}{crec}{tags}[-1];
 160     my $field = $map->field($tag, $code);
 161     if ($map->mod($field) eq 'multi') {
 162         my $name = $tag . $code;
 163         push @{$data->{multi}{$name}}, $sub->text;
 164     } else {
 165         $data->{uni}{$code} = $sub->text;
 166     }
 167 }
 168
 169 =head1 PARSED RECORDS
 170
 171     {
 172       egid   => evergreen_record_id,
 173       bib    => {
 174                   (tag_id . sub_code)1 => value1,
 175                   (tag_id . sub_code)2 => value2,
 176                   ...
 177                 },
 178       tags => [
 179                 {
 180                   tag   => tag_id,
 181                   multi => { (tag_id . sub_code) => [ val1, val2, ... ] },
 182                   uni   => { code => value, code2 => value2, ... },
 183                 },
 184                 ...
 185               ]
 186     }
 187
 188 That is, there is an C<egid> key which points to the Evergreen ID of
 189 that record, a C<bib> key which points to a hashref, and a C<tags>
 190 key which points to an arrayref.
 191
 192 =head3 C<bib>
 193
 194 This hashref holds extracted data which should occur once per record
 195 (the default assumption is that a tag/subfield pair can occur multiple
 196 times per record). The keys are composed of tag id and subfield code,
 197 catenated (e.g. 901c). The values are the contents of that subfield of
 198 that tag.
 199
 200 =head3 C<tags>
 201
 202 This arrayref holds anonymous hashrefs, one for each instance of each
 203 tag which occurs in the map. Each tag hashref holds its own id
 204 (e.g. C<998>), and two more hashrefs, C<multi> and C<uni>.
 205
 206 The C<multi> hashref holds the extracted data for tag/sub mappings
 207 which have the C<multiple> modifier on them. The keys in C<multi> are
 208 composed of the tag id and subfield code, catenated
 209 (e.g. C<901c>). The values are arrayrefs containing the content of all
 210 instances of that subfield in that instance of that tag.
 211
 212 The C<uni> hashref holds data for tag/sub mappings which occur only
 213 once per instance of a tag (but may occur multiple times in a record
 214 due to there being multiple instances of that tag in a record). Keys
 215 are subfield codes and values are subfield content.
 216
 217 =head1 UNMAPPED TAGS
 218
 219     { tag_id => {
 220                   sub_code  => { value => VALUE, count => COUNT },
 221                   sub_code2 => { value => VALUE, count => COUNT },
 222                   ...
 223                 },
 224       ...
 225     }
 226
 227 =head1 AUTHOR
 228
 229 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
 230
 231 =head1 BUGS
 232
 233 Please report any bugs or feature requests to the above email address.
 234
 235 =head1 SUPPORT
 236
 237 You can find documentation for this module with the perldoc command.
 238
 239     perldoc Equinox::Migration::MapDrivenMARCXMLProc
 240
 241
 242 =head1 COPYRIGHT & LICENSE
 243
 244 Copyright 2009 Equinox, all rights reserved.
 245
 246 This program is free software; you can redistribute it and/or modify it
 247 under the same terms as Perl itself.
 248
 249
 250 =cut
 251
 252 1; # End of Equinox::Migration::MapDrivenMARCXMLProc