1 package Equinox::Migration::MapDrivenMARCXMLProc;
8 use Equinox::Migration::SubfieldMapper 1.004;
13 Equinox::Migration::MapDrivenMARCXMLProc
21 our $VERSION = '1.002';
25 my @mods = qw( multi bib required );
34 use Equinox::Migration::MapDrivenMARCXMLProc;
42 Takes two required arguments: C<mapfile> (which will be passed along
43 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
44 and C<marcfile> (the MARC data to be processed).
46 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE,
52 my ($class, %args) = @_;
54 $verbose = 1 if $args{verbose};
59 # initialize map and taglist
60 die "Argument 'mapfile' must be specified\n" unless ($args{mapfile});
61 $sfmap = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
64 # initialize datastore
65 die "Datastore file 'EMMXSSTORAGE.dbmd' already exists. Exiting.\n"
66 if (-e "EMMXSSTORAGE.dbmd");
67 $dstore = DBM::Deep->new( file => "EMMXSSTORAGE.dbmd",
68 data_sector_size => 256 );
69 $reccount = 0; # next record ptr
70 $dstore->{tags} = $sfmap->tags; # list of all tags
71 $self->{data} = $dstore;
74 die "Argument 'marcfile' must be specified\n" unless ($args{marcfile});
75 if (-r $args{marcfile}) {
76 my $xmltwig = XML::Twig->new( twig_handlers => { record => \&parse_record } );
77 $xmltwig->parsefile( $args{marcfile} );
79 die "Can't open marc file: $!\n";
85 sub DESTROY { unlink "EMMXSSTORAGE.dbmd" }
89 Extracts data from the next record, per the mapping file.
94 my ($twig, $record) = @_;
95 my $crec = {}; # current record
97 my @fields = $record->children;
99 { process_field($f, $crec) }
101 # cleanup memory and increment pointer
105 # check for required fields
107 push @{ $dstore->{recs} }, $crec;
109 print STDERR "$reccount\n"
110 if ($verbose and !($reccount % 1000));
114 my ($field, $crec) = @_;
115 my $tag = $field->{'att'}->{'tag'};
118 unless (defined $tag) {
125 my $sub = $field->first_child('subfield');
126 $crec->{egid} = $sub->text;
129 if ($sfmap->has($tag)) {
130 push @{$crec->{tags}}, { tag => $tag, uni => undef, multi => undef };
131 push @{$crec->{tmap}{$tag}}, (@{$crec->{tags}} - 1);
132 my @subs = $field->children('subfield');
134 { process_subs($tag, $sub, $crec) }
136 # check map to ensure all declared tags and subs have a value
137 my $mods = $sfmap->mods($field);
138 for my $mappedsub ( @{ $sfmap->subfields($tag) } ) {
139 next if $mods->{multi};
140 $crec->{tags}[-1]{uni}{$mappedsub} = ''
141 unless defined $crec->{tags}[-1]{uni}{$mappedsub};
143 for my $mappedtag ( @{ $sfmap->tags }) {
144 $crec->{tmap}{$mappedtag} = undef
145 unless defined $crec->{tmap}{$mappedtag};
151 my ($tag, $sub, $crec) = @_;
152 my $code = $sub->{'att'}->{'code'};
154 # handle unmapped tag/subs
155 return unless ($sfmap->has($tag, $code));
157 # fetch our datafield struct and fieldname
158 my $dataf = $crec->{tags}[-1];
159 my $field = $sfmap->field($tag, $code);
160 $crec->{names}{$tag}{$code} = $field;
163 for my $filter ( @{$sfmap->filters($field)} ) {
164 return if ($sub->text =~ /$filter/i);
166 # handle multi modifier
167 if (my $mods = $sfmap->mods($field)) {
168 if ($mods->{multi}) {
169 push @{$dataf->{multi}{$code}}, $sub->text;
174 # if this were a multi field, it would be handled already. make sure its a singleton
175 die "Multiple occurances of a non-multi field: $tag$code at rec ",
176 ($reccount + 1),"\n" if (defined $dataf->{uni}{$code});
178 # everything seems okay
179 $dataf->{uni}{$code} = $sub->text;
184 my $mods = $sfmap->mods;
185 my $crec = $dstore->{crec};
187 for my $tag_id (keys %{$mods->{required}}) {
188 for my $code (@{$mods->{required}{$tag_id}}) {
191 for my $tag (@{$crec->{tags}}) {
192 $found = 1 if ($tag->{multi}{($tag_id . $code)});
193 $found = 1 if ($tag->{uni}{$code});
196 die "Required mapping $tag_id$code not found in rec ",$reccount,"\n"
205 Returns current record number (starting from zero)
209 sub recno { my ($self) = @_; return $self->{data}{rcnt} }
213 Returns mapped fieldname when passed a record number, tag, and code
215 my $name = $m->name(3,999,'a');
219 sub name { my ($self, $r, $t, $c) = @_; return $dstore->{recs}[$r]{names}{$t}{$c} };
223 MapDrivenMARCXMLProc implements the following modifiers, and passes
224 them to L<Equinox::Migration::SubfieldMapper>, meaning that specifying
225 any other modifiers in a MDMP map file will cause a fatal error when
230 If a mapping is declared to be C<multi>, then MDMP expects to see more
231 than one instance of that subfield per datafield, and the data is
232 handled accordingly (see L</PARSED RECORDS> below).
234 Occurring zero or one time is legal for a C<multi> mapping.
236 A mapping which is not flagged as C<multi>, but which occurs more than
237 once per datafield will cause a fatal error.
241 By default, if a mapping does not occur in a datafield, processing
242 continues normally. if a mapping has the C<required> modifier,
243 however, it must appear, or a fatal error will occur.
245 =head1 PARSED RECORDS
249 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new(ARGUMENTS);
250 $rec = $m->parse_record;
252 Then C<$rec> will look like:
255 egid => evergreen_record_id,
259 multi => { code => [ val1, val2, ... ] },
260 uni => { code => value, code2 => value2, ... },
264 tmap => { tag_id => [ INDEX_LIST ], tag_id2 => [ INDEX_LIST ], ... }
267 That is, there is an C<egid> key which points to the Evergreen ID of
268 that record, a C<tags> key which points to an arrayref, and a C<tmap>
269 key which points to a hashref.
273 A reference to a list of anonymous hashes, one for each instance of
274 each tag which occurs in the map.
276 Each tag hash holds its own id (e.g. C<998>), and two references to
277 two more hashrefs, C<multi> and C<uni>.
279 The C<multi> hash holds the extracted data for tag/sub mappings which
280 have the C<multiple> modifier on them. The keys in C<multi> subfield
281 codes. The values are arrayrefs containing the content of all
282 instances of that subfield in that instance of that tag. If no tags
283 are defined as C<multi>, it will be C<undef>.
285 The C<uni> hash holds data for tag/sub mappings which occur only once
286 per instance of a tag (but may occur multiple times in a record due to
287 there being multiple instances of that tag in a record). Keys are
288 subfield codes and values are subfield content.
290 All C<uni> subfields occuring in the map are guaranteed to be
291 defined. Sufields which are mapped but do not occur in a particular
292 datafield will be given a value of '' (the null string) in the current
293 record struct. Oppose subfields which are not mapped, which will be
298 A hashref, where each key (a tag id like "650") points to a listref
299 containing the index (or indices) of C<tags> where that tag has
302 The intended use of this is to simplify the processing of data from
303 tags which can appear more than once in a MARC record, like
304 holdings. If your holdings data is in 852, C<tmap->{852}> will be a
305 listref with the indices of C<tags> which hold the data from the 852
308 Complimentarily, C<tmap> prevents data from singular datafields from
309 having to be copied for every instance of a multiple datafield, as it
310 lets you get the data from that record's one instance of whichever
311 field you're looking for.
315 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
319 Please report any bugs or feature requests to the above email address.
323 You can find documentation for this module with the perldoc command.
325 perldoc Equinox::Migration::MapDrivenMARCXMLProc
328 =head1 COPYRIGHT & LICENSE
330 Copyright 2009 Equinox, all rights reserved.
332 This program is free software; you can redistribute it and/or modify it
333 under the same terms as Perl itself.
338 1; # End of Equinox::Migration::MapDrivenMARCXMLProc