1 package Equinox::Migration::MapDrivenMARCXMLProc;
3 # Copyright 2009-2012, Equinox Software, Inc.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
23 use Equinox::Migration::SubfieldMapper 1.004;
28 Equinox::Migration::MapDrivenMARCXMLProc
36 our $VERSION = '1.005';
40 my @modlist = qw( multi ignoremulti required first concatenate parallel );
43 my $parallel_fields = {};
52 use Equinox::Migration::MapDrivenMARCXMLProc;
60 Takes two required arguments: C<mapfile> (which will be passed along
61 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
62 and C<marcfile> (the MARC data to be processed).
64 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE,
70 my ($class, %args) = @_;
72 $verbose = 1 if $args{verbose};
76 parallel_fields => \$parallel_fields,
79 # initialize map and taglist
80 die "Argument 'mapfile' must be specified\n" unless ($args{mapfile});
81 $sfmap = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
84 # initialize datastore
86 $reccount = 0; # next record ptr
87 $dstore->{tags} = $sfmap->tags; # list of all tags
88 $self->{data} = $dstore;
91 die "Argument 'marcfile' must be specified\n" unless ($args{marcfile});
92 if (-r $args{marcfile}) {
93 my $xmltwig = XML::Twig->new( twig_handlers => { record => \&parse_record } );
94 $xmltwig->parsefile( $args{marcfile} );
96 die "Can't open marc file: $!\n";
104 Extracts data from the next record, per the mapping file.
109 my ($twig, $record) = @_;
110 my $crec = {}; # current record
112 my @fields = $record->children;
114 { process_field($f, $crec) }
116 # fill in blank values if needed
117 for my $mappedtag ( @{ $sfmap->tags }) {
118 unless (exists $crec->{tmap}{$mappedtag}) {
119 push @{ $crec->{tags} }, {};
120 for my $mappedsub ( @{ $sfmap->subfields($mappedtag) } ) {
121 my $fieldname = $sfmap->field($mappedtag, $mappedsub);
122 my $mods = $sfmap->mods($fieldname);
123 next if $mods->{multi};
124 if ($mods->{parallel}) {
125 push @{ $crec->{tags}[-1]{parallel}{$mappedsub} }, '';
126 $crec->{tags}[-1]{uni} = undef;
128 $crec->{tags}[-1]{uni}{$mappedsub} = '';
129 $crec->{tags}[-1]{parallel} = undef;
131 $crec->{tags}[-1]{multi} = undef;
132 $crec->{tags}[-1]{tag} = $mappedtag;
134 push @{ $crec->{tmap}{$mappedtag} }, $#{ $crec->{tags} };
138 # cleanup memory and increment pointer
142 # check for required fields
143 check_required($crec);
144 push @{ $dstore->{recs} }, $crec;
146 print STDERR "$reccount\n"
147 if ($verbose and !($reccount % 1000));
151 my ($field, $crec) = @_;
152 my $tag = $field->{'att'}->{'tag'};
155 unless (defined $tag) {
162 my $sub = $field->first_child('subfield');
163 $crec->{egid} = $sub->text;
166 if ($sfmap->has($tag)) {
167 push @{$crec->{tags}}, { tag => $tag, uni => undef, multi => undef, parallel => undef };
168 push @{$crec->{tmap}{$tag}}, (@{$crec->{tags}} - 1);
169 my @subs = $field->children('subfield');
171 { process_subs($tag, $sub, $crec) }
173 # check map to ensure all declared tags and subs have a value
174 for my $mappedsub ( @{ $sfmap->subfields($tag) } ) {
175 my $fieldname = $sfmap->field($tag, $mappedsub);
176 my $mods = $sfmap->mods($fieldname);
177 next if $mods->{multi};
178 if ($mods->{parallel}) {
179 push @{ $crec->{tags}[-1]{parallel}{$mappedsub} }, ''
180 unless defined $crec->{tags}[-1]{parallel}{$mappedsub};
182 $crec->{tags}[-1]{uni}{$mappedsub} = ''
183 unless defined $crec->{tags}[-1]{uni}{$mappedsub};
190 my ($tag, $sub, $crec) = @_;
191 my $code = $sub->{'att'}->{'code'};
193 # handle unmapped tag/subs
194 return unless ($sfmap->has($tag, $code));
196 # fetch our datafield struct and field and mods
197 my $dataf = $crec->{tags}[-1];
198 my $field = $sfmap->field($tag, $code);
199 my $sep = $sfmap->sep($field);
200 $allmods{$field} = $sfmap->mods($field) unless $allmods{$field};
201 my $mods = $allmods{$field};
204 for my $filter ( @{$sfmap->filters($field)} ) {
205 return if ($sub->text =~ /$filter/i);
208 if ($mods->{parallel}) {
209 $parallel_fields->{$tag}{$code} = 1;
210 push @{$dataf->{parallel}{$code}}, $sub->text;
214 # handle multi modifier
215 if ($mods->{multi}) {
216 $multis->{$tag}{$code} = 1;
217 if ($mods->{concatenate}) {
218 if (exists($dataf->{multi}{$code})) {
219 $dataf->{multi}{$code}[0] .= $sep . $sub->text;
221 push @{$dataf->{multi}{$code}}, $sub->text;
223 $multis->{$tag}{$code} = 1;
225 push @{$dataf->{multi}{$code}}, $sub->text;
231 if ($mods->{concatenate}) {
232 if (exists($dataf->{uni}{$code})) {
233 $dataf->{uni}{$code} .= $sep . $sub->text;
235 $dataf->{uni}{$code} = $sub->text;
240 # if this were a multi field, it would be handled already. make sure its a singleton
241 die "Multiple occurances of a non-multi field: $tag$code at rec ",
242 ($reccount + 1),"\n" if (defined $dataf->{uni}{$code} and !$mods->{ignoremulti});
244 # everything seems okay
245 $dataf->{uni}{$code} = $sub->text;
251 my $mods = $sfmap->mods;
253 for my $tag_id (keys %{$mods->{required}}) {
254 for my $code (@{$mods->{required}{$tag_id}}) {
257 for my $tag (@{$crec->{tags}}) {
258 $found = 1 if ($tag->{multi}{($tag_id . $code)});
259 $found = 1 if ($tag->{uni}{$code});
262 die "Required mapping $tag_id$code not found in rec ",$reccount,"\n"
271 Returns current record number (starting from zero)
275 sub recno { my ($self) = @_; return $self->{data}{rcnt} }
279 Returns mapped fieldname when passed a tag, and code
281 my $name = $m->name(999,'a');
285 sub name { my ($self, $t, $c) = @_; return $sfmap->field($t, $c) }
289 Returns whether mapped fieldname is to be applied only to first
295 my ($self, $t, $c) = @_;
296 my $field = $sfmap->field($t, $c);
297 my $mods = $sfmap->mods($field);
298 return exists($mods->{first});
303 Returns hashref of C<{tag}{code}> for all mapped multi fields
312 =head2 get_parallel_fields
314 Returns hashref of C<{tag}{code}> for all mapped parallel fields
318 sub get_parallel_fields {
320 return $parallel_fields;
325 MapDrivenMARCXMLProc implements the following modifiers, and passes
326 them to L<Equinox::Migration::SubfieldMapper>, meaning that specifying
327 any other modifiers in a MDMP map file will cause a fatal error when
332 If a mapping is declared to be C<multi>, then MDMP expects to see more
333 than one instance of that subfield per datafield, and the data is
334 handled accordingly (see L</PARSED RECORDS> below).
336 Occurring zero or one time is legal for a C<multi> mapping.
338 A mapping which is not flagged as C<multi>, but which occurs more than
339 once per datafield will cause a fatal error.
343 By default, if a mapping does not occur in a datafield, processing
344 continues normally. if a mapping has the C<required> modifier,
345 however, it must appear, or a fatal error will occur.
347 =head1 PARSED RECORDS
351 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new(ARGUMENTS);
352 $rec = $m->parse_record;
354 Then C<$rec> will look like:
357 egid => evergreen_record_id,
361 multi => { code => [ val1, val2, ... ] },
362 uni => { code => value, code2 => value2, ... },
366 tmap => { tag_id => [ INDEX_LIST ], tag_id2 => [ INDEX_LIST ], ... }
369 That is, there is an C<egid> key which points to the Evergreen ID of
370 that record, a C<tags> key which points to an arrayref, and a C<tmap>
371 key which points to a hashref.
375 A reference to a list of anonymous hashes, one for each instance of
376 each tag which occurs in the map.
378 Each tag hash holds its own id (e.g. C<998>), and two references to
379 two more hashrefs, C<multi> and C<uni>.
381 The C<multi> hash holds the extracted data for tag/sub mappings which
382 have the C<multiple> modifier on them. The keys in C<multi> subfield
383 codes. The values are arrayrefs containing the content of all
384 instances of that subfield in that instance of that tag. If no tags
385 are defined as C<multi>, it will be C<undef>.
387 The C<uni> hash holds data for tag/sub mappings which occur only once
388 per instance of a tag (but may occur multiple times in a record due to
389 there being multiple instances of that tag in a record). Keys are
390 subfield codes and values are subfield content.
392 All C<uni> subfields occuring in the map are guaranteed to be
393 defined. Sufields which are mapped but do not occur in a particular
394 datafield will be given a value of '' (the null string) in the current
395 record struct. Oppose subfields which are not mapped, which will be
400 A hashref, where each key (a tag id like "650") points to a listref
401 containing the index (or indices) of C<tags> where that tag has
404 The intended use of this is to simplify the processing of data from
405 tags which can appear more than once in a MARC record, like
406 holdings. If your holdings data is in 852, C<tmap->{852}> will be a
407 listref with the indices of C<tags> which hold the data from the 852
410 Complimentarily, C<tmap> prevents data from singular datafields from
411 having to be copied for every instance of a multiple datafield, as it
412 lets you get the data from that record's one instance of whichever
413 field you're looking for.
417 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
421 Please report any bugs or feature requests to the above email address.
425 You can find documentation for this module with the perldoc command.
427 perldoc Equinox::Migration::MapDrivenMARCXMLProc
430 =head1 COPYRIGHT & LICENSE
432 Copyright 2009 Equinox, all rights reserved.
434 This program is free software; you can redistribute it and/or modify it
435 under the same terms as Perl itself.
440 1; # End of Equinox::Migration::MapDrivenMARCXMLProc