Equinox-Migration/lib/Equinox/Migration/MapDrivenMARCXMLProc.pm

   1 package Equinox::Migration::MapDrivenMARCXMLProc;
   2
   3 use warnings;
   4 use strict;
   5
   6 use XML::Twig;
   7 use Equinox::Migration::SubfieldMapper;
   8
   9 =head1 NAME
  10
  11 Equinox::Migration::MapDrivenMARCXMLProc
  12
  13 =head1 VERSION
  14
  15 Version 1.000
  16
  17 =cut
  18
  19 our $VERSION = '1.000';
  20
  21
  22 =head1 SYNOPSIS
  23
  24 Foo
  25
  26     use Equinox::Migration::MapDrivenMARCXMLProc;
  27
  28
  29 =head1 METHODS
  30
  31
  32 =head2 new
  33
  34 Takes two required arguments: C<mapfile> (which will be passed along
  35 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
  36 and C<marcfile> (the MARC data to be processed).
  37
  38     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile  => FILE,
  39                                                            marcfile => FILE );
  40
  41 There is an optional third, argument, C<sample>, which specifies a
  42 arrayref of datafields to "sample" by reporting on subfields which are
  43 found in the data but not in the map.
  44
  45     my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile  => FILE,
  46                                                            marcfile => FILE,
  47                                                            sample   => \@TAGS
  48                                                          );
  49
  50 See L</UNMAPPED TAGS> for more info.
  51
  52 =cut
  53
  54 sub new {
  55     my ($class, %args) = @_;
  56
  57     my $self = bless { mods => { multi    => {},
  58                                  once     => {},
  59                                  required => {},
  60                                },
  61                        data => { recs => undef, # X::T record objects
  62                                  rptr => 0,     # next record pointer
  63                                  crec => undef, # parsed record storage
  64                                  stag => undef, # list of tags to sample
  65                                  umap => undef, # unmapped data samples
  66                                },
  67                      }, $class;
  68
  69     # initialize map and taglist
  70     die "Argument 'mapfile' must be specified\n" unless (defined $args{mapfile});
  71     my @mods = keys %{$self->{mods}};
  72     $self->{map} = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
  73                                                             mods => \@mods );
  74     $self->{data}{tags} = $self->{map}->tags;
  75
  76     # initialize twig
  77     die "Argument 'marcfile' must be specified\n" unless (defined $args{marcfile});
  78     if (-r $args{marcfile}) {
  79         $self->{twig} = XML::Twig->new;
  80         $self->{twig}->parsefile($args{marcfile});
  81         my @records = $self->{twig}->root->children;
  82         $self->{data}{recs} = \@records;
  83     } else {
  84         die "Can't open marc file: $!\n";
  85     }
  86
  87     return $self;
  88 }
  89
  90
  91 =head2 parse_record
  92
  93 Extracts data from the next record, per the mapping file. Returns a
  94 normalized datastructure (see L</format_record> for details) on
  95 success; returns 0 otherwise.
  96
  97     while (my $rec = $m->parse_record) {
  98       # handle extracted record data
  99     }
 100
 101 =cut
 102
 103 sub parse_record {
 104     my ($self) = @_;
 105
 106     # get the next record and wipe current parsed record
 107     return 0 unless defined $self->{data}{recs}[ $self->{data}{rptr} ];
 108     my $record = $self->{data}{recs}[ $self->{data}{rptr} ];
 109     $self->{data}{crec} = { egid => undef, bib  => undef, tags => undef };
 110
 111     my @fields = $record->children;
 112     for my $f (@fields)
 113       { $self->process_field($f) }
 114
 115     # cleanup memory and increment pointer
 116     $record->purge;
 117     $self->{data}{rptr}++;
 118
 119     return $self->{data}{crec};
 120 }
 121
 122 =head2 process_field
 123
 124 =cut
 125
 126 sub process_field {
 127     my ($self, $field) = @_;
 128     my $map = $self->{map};
 129     my $tag = $field->{'att'}->{'tag'};
 130     my $crec = $self->{data}{crec};
 131
 132     # datafields
 133     if (defined $tag) {
 134         if ($tag == 903) {
 135             my $sub = $field->first_child('subfield');
 136             $crec->{egid} = $sub->text;;
 137         } elsif ($map->has($tag)) {
 138             push @{$crec->{tags}}, { tag => $tag, uni => undef, multi => undef };
 139             my @subs = $field->children('subfield');
 140             for my $sub (@subs)
 141               { $self->process_subs($tag, $sub) }
 142             # check map to ensure all declared subs are in
 143         }
 144     }
 145 }
 146
 147 =head2 process_subs
 148
 149 =cut
 150
 151 sub process_subs {
 152     my ($self, $tag, $sub) = @_;
 153     my $map  = $self->{map};
 154     my $code = $sub->{'att'}->{'code'};
 155
 156     # handle unmapped tag/subs
 157     unless ($map->has($tag, $code)) {
 158         my $u = $self->{data}{umap};
 159         my $s = $self->{data}{stag};
 160         return unless (defined $s->{$tag});
 161
 162         # set a value, total-seen count and records-seen-in count
 163         $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code};
 164         $u->{$tag}{$code}{count}++;
 165         $u->{$tag}{$code}{rcnt}++ unless ($u->{$tag}{$code}{last} == $self->{data}{rptr});
 166         $u->{$tag}{$code}{last} = $self->{data}{rptr};
 167         return;
 168     }
 169
 170     my $dataf = $self->{data}{crec}{tags}[-1];
 171     my $field = $map->field($tag, $code);
 172
 173     # handle modifiers
 174     if (defined $map->mods($field)) {
 175         if ($map->mods($field) eq 'multi') {
 176             my $name = $tag . $code;
 177             push @{$dataf->{multi}{$name}}, $sub->text;
 178         }
 179     }
 180
 181     $dataf->{uni}{$code} = $sub->text;
 182 }
 183
 184 =head1 PARSED RECORDS
 185
 186     {
 187       egid   => evergreen_record_id,
 188       bib    => {
 189                   (tag_id . sub_code)1 => value1,
 190                   (tag_id . sub_code)2 => value2,
 191                   ...
 192                 },
 193       tags => [
 194                 {
 195                   tag   => tag_id,
 196                   multi => { (tag_id . sub_code) => [ val1, val2, ... ] },
 197                   uni   => { code => value, code2 => value2, ... },
 198                 },
 199                 ...
 200               ]
 201     }
 202
 203 That is, there is an C<egid> key which points to the Evergreen ID of
 204 that record, a C<bib> key which points to a hashref, and a C<tags>
 205 key which points to an arrayref.
 206
 207 =head3 C<bib>
 208
 209 This hashref holds extracted data which should occur once per record
 210 (the default assumption is that a tag/subfield pair can occur multiple
 211 times per record). The keys are composed of tag id and subfield code,
 212 catenated (e.g. 901c). The values are the contents of that subfield of
 213 that tag.
 214
 215 If there are no tags defined as bib-level, C<bib> will be C<undef>.
 216
 217 =head3 C<tags>
 218
 219 This arrayref holds anonymous hashrefs, one for each instance of each
 220 tag which occurs in the map. Each tag hashref holds its own id
 221 (e.g. C<998>), and two more hashrefs, C<multi> and C<uni>.
 222
 223 The C<multi> hashref holds the extracted data for tag/sub mappings
 224 which have the C<multiple> modifier on them. The keys in C<multi> are
 225 composed of the tag id and subfield code, catenated
 226 (e.g. C<901c>). The values are arrayrefs containing the content of all
 227 instances of that subfield in that instance of that tag.
 228
 229 The C<uni> hashref holds data for tag/sub mappings which occur only
 230 once per instance of a tag (but may occur multiple times in a record
 231 due to there being multiple instances of that tag in a record). Keys
 232 are subfield codes and values are subfield content.
 233
 234 If no tags are defined as C<multi>, it will be C<undef>.
 235
 236 =head1 UNMAPPED TAGS
 237
 238     { tag_id => {
 239                   sub_code  => { value => VALUE, count => COUNT },
 240                   sub_code2 => { value => VALUE, count => COUNT },
 241                   ...
 242                 },
 243       ...
 244     }
 245
 246 =head1 AUTHOR
 247
 248 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
 249
 250 =head1 BUGS
 251
 252 Please report any bugs or feature requests to the above email address.
 253
 254 =head1 SUPPORT
 255
 256 You can find documentation for this module with the perldoc command.
 257
 258     perldoc Equinox::Migration::MapDrivenMARCXMLProc
 259
 260
 261 =head1 COPYRIGHT & LICENSE
 262
 263 Copyright 2009 Equinox, all rights reserved.
 264
 265 This program is free software; you can redistribute it and/or modify it
 266 under the same terms as Perl itself.
 267
 268
 269 =cut
 270
 271 1; # End of Equinox::Migration::MapDrivenMARCXMLProc