1 package Equinox::Migration::MapDrivenMARCXMLProc;
7 use Equinox::Migration::SubfieldMapper 1.003;
11 # sample functionality should be extracted into a new module which
12 # uses E::M::SM to drive sampling of individual datafields, and
13 # reports ALL datafields which occur
15 # --sample should give the list of all datafields
16 # --samplefile should take a SM map as the argument and introspect the mapped datafields
21 Equinox::Migration::MapDrivenMARCXMLProc
29 our $VERSION = '1.000';
36 use Equinox::Migration::MapDrivenMARCXMLProc;
44 Takes two required arguments: C<mapfile> (which will be passed along
45 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
46 and C<marcfile> (the MARC data to be processed).
48 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE,
# Constructor fragment (the enclosing sub line and several continuation and
# closing lines are not visible in this excerpt).
# Arguments arrive as a named list; mapfile and marcfile are both required.
54 my ($class, %args) = @_;
# Object state: mods holds the modifier names (only multi is visible here),
# data holds the record list, the next-record pointer and the parsed record.
56 my $self = bless { mods => { multi => {},
60 data => { recs => undef, # X::T record objects
61 rptr => 0, # next record pointer
62 crec => undef, # parsed record storage
66 # initialize map and taglist
67 die "Argument 'mapfile' must be specified\n" unless (defined $args{mapfile});
# Collect the modifier names from the mods hash; presumably handed to the
# SubfieldMapper constructor on a continuation line not shown -- TODO confirm.
68 my @mods = keys %{$self->{mods}};
# Build the subfield map from the map file and cache its tag list.
69 $self->{map} = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
71 $self->{data}{tags} = $self->{map}->tags;
74 die "Argument 'marcfile' must be specified\n" unless (defined $args{marcfile});
# When the file is readable, parse it with XML::Twig in one call and keep the
# children of the document root as the list of records to iterate over.
75 if (-r $args{marcfile}) {
76 $self->{twig} = XML::Twig->new;
77 $self->{twig}->parsefile($args{marcfile});
78 my @records = $self->{twig}->root->children;
79 $self->{data}{recs} = \@records;
# NOTE(review): presumably the else branch of the readability test above; the
# branch keyword itself is not visible in this excerpt.
81 die "Can't open marc file: $!\n";
90 Extracts data from the next record, per the mapping file. Returns a
91 normalized datastructure (see L</format_record> for details) on
92 success; returns 0 otherwise.
94 while (my $rec = $m->parse_record) {
95 # handle extracted record data
# parse_record fragment (sub line, argument unpacking and loop header are not
# visible in this excerpt). Processes the record at the current pointer and
# returns the parsed structure, or 0 when no record is left.
103 # get the next record and wipe current parsed record
104 return 0 unless defined $self->{data}{recs}[ $self->{data}{rptr} ];
105 my $record = $self->{data}{recs}[ $self->{data}{rptr} ];
# Fresh result structure; all three keys start out undefined.
106 $self->{data}{crec} = { egid => undef, bib => undef, tags => undef };
108 my @fields = $record->children;
# Loop body: hand each field to process_field (presumably iterating over
# @fields with $f as the loop variable -- loop header not shown).
110 { $self->process_field($f) }
112 # cleanup memory and increment pointer
# The pointer is advanced before check_required is called, so the record
# number check_required prints in its error message is already 1-based.
114 $self->{data}{rptr}++;
116 # check for required fields
117 $self->check_required;
119 return $self->{data}{crec};
# process_field fragment (sub line and several closing lines are not visible).
# Handles one child element of a record: reads its tag attribute and, when the
# tag is present in the map, records its subfields in the current record.
123 my ($self, $field) = @_;
124 my $map = $self->{map};
125 my $tag = $field->{'att'}->{'tag'};
126 my $crec = $self->{data}{crec};
# Elements with no tag attribute get separate handling (body not shown).
129 unless (defined $tag) {
# The text of the first subfield child becomes the record egid.
# NOTE(review): the condition guarding these two lines is not visible here.
136 my $sub = $field->first_child('subfield');
137 $crec->{egid} = $sub->text;
# Mapped tag: append a new entry to the tags list, then feed every subfield
# child to process_subs, which fills the entry just pushed.
140 if ($map->has($tag)) {
141 push @{$crec->{tags}}, { tag => $tag, uni => undef, multi => undef };
142 my @subs = $field->children('subfield');
144 { $self->process_subs($tag, $sub) }
145 # check map to ensure all declared subs have a value
# NOTE(review): $field is the element received on entry, while process_subs
# passes mods() the name returned by $map->field($tag, $code). The multi test
# below also does not vary with $mappedsub. This looks like it was meant to be
# a per-subfield lookup -- confirm against SubfieldMapper before changing.
146 my $mods = $map->mods($field);
147 for my $mappedsub ( @{ $map->subfields($tag) } ) {
148 next if $mods->{multi};
# Default any mapped subfield that did not occur to the empty string.
149 $crec->{tags}[-1]{uni}{$mappedsub} = ''
150 unless defined $crec->{tags}[-1]{uni}{$mappedsub};
# process_subs fragment (sub line and closing lines are not visible).
# Stores the text of one subfield in the most recently pushed tag entry of the
# current record, provided the tag/code pair is mapped.
156 my ($self, $tag, $sub) = @_;
157 my $map = $self->{map};
158 my $code = $sub->{'att'}->{'code'};
160 # handle unmapped tag/subs
161 return unless ($map->has($tag, $code));
163 # fetch our datafield struct and fieldname
164 my $dataf = $self->{data}{crec}{tags}[-1];
165 my $field = $map->field($tag, $code);
# Mappings flagged multi accumulate every value in an array keyed by the
# catenated tag and subfield code.
168 if (my $mods = $map->mods($field)) {
169 if ($mods->{multi}) {
170 my $name = $tag . $code;
171 push @{$dataf->{multi}{$name}}, $sub->text;
# Otherwise the value goes into uni, keyed by the bare subfield code; a second
# occurrence within the same datafield is fatal. rptr has not been incremented
# yet at this point, hence the + 1 to report a 1-based record number.
176 die "Multiple occurances of a non-multi field: $tag$code at rec ",
177 ($self->{data}{rptr} + 1),"\n" if (defined $dataf->{uni}{$code});
178 $dataf->{uni}{$code} = $sub->text;
# check_required fragment (sub line, the declaration of $found and closing
# lines are not visible). Dies when a mapping flagged as required has no value
# in the current parsed record.
# Called with no argument, mods() is used here as a structure whose required
# key maps tag ids to lists of subfield codes.
184 my $mods = $self->{map}->mods;
185 my $crec = $self->{data}{crec};
187 for my $tag_id (keys %{$mods->{required}}) {
188 for my $code (@{$mods->{required}{$tag_id}}) {
# A required mapping counts as found when it has a true value in bib, or in
# the multi or uni hash of any tag entry.
191 $found = 1 if ($crec->{bib}{($tag_id . $code)});
192 for my $tag (@{$crec->{tags}}) {
193 $found = 1 if ($tag->{multi}{($tag_id . $code)});
# NOTE(review): uni is keyed by the bare code and the tag of the entry is not
# compared with $tag_id, so the same code under a different tag also matches.
# The tests are truth tests, so a value of "0" counts as missing -- confirm
# whether either is intended.
194 $found = 1 if ($tag->{uni}{$code});
# rptr was already incremented by the caller, so this number is 1-based.
197 die "Required mapping $tag_id$code not found in rec ",$self->{data}{rptr},"\n"
206 MapDrivenMARCXMLProc implements the following modifiers, and passes
207 them to L<Equinox::Migration::SubfieldMapper>, meaning that specifying
208 any other modifiers in a MDMP map file will cause a fatal error when
213 If a mapping is declared to be C<multi>, then MDMP expects to see more
214 than one instance of that subfield per datafield, and the data is
215 handled accordingly (see L</PARSED RECORDS> below).
217 Occurring zero or one time is legal for a C<multi> mapping.
219 A mapping which is not flagged as C<multi>, but which occurs more than
220 once per datafield will cause a fatal error.
224 The C<bib> modifier declares that a mapping is "bib-level", and should
225 be encountered once per B<record> instead of once per B<datafield> --
226 which is another way of saying that it occurs in a non-repeating
227 datafield or in a controlfield.
231 By default, if a mapping does not occur in a datafield (or record, in
232 the case of C<bib> mappings), processing continues normally. If a
233 mapping has the C<required> modifier, however, it must appear, or a
234 fatal error will occur.
236 =head1 PARSED RECORDS
240 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new(ARGUMENTS);
241 $rec = $m->parse_record;
243 Then C<$rec> will look like:
246 egid => evergreen_record_id,
248 (tag_id . sub_code)1 => value1,
249 (tag_id . sub_code)2 => value2,
255 multi => { (tag_id . sub_code) => [ val1, val2, ... ] },
256 uni => { code => value, code2 => value2, ... },
262 That is, there is an C<egid> key which points to the Evergreen ID of
263 that record, a C<bib> key which points to a hashref, and a C<tags>
264 key which points to an arrayref.
268 A reference to a hash which holds extracted data which occurs only
269 once per record (and is therefore "bib-level"; the default assumption
270 is that a tag/subfield pair can occur multiple times per record). The
271 keys are composed of tag id and subfield code, catenated
272 (e.g. 901c). The values are the contents of that subfield of that tag.
274 If there are no tags defined as bib-level in the mapfile, C<bib> will
279 A reference to a list of anonymous hashes, one for each instance of
280 each tag which occurs in the map.
282 Each tag hash holds its own id (e.g. C<998>), and two references to
283 two more hashrefs, C<multi> and C<uni>.
285 The C<multi> hash holds the extracted data for tag/sub mappings which
286 have the C<multi> modifier on them. The keys in C<multi> are
287 composed of the tag id and subfield code, catenated
288 (e.g. C<901c>). The values are arrayrefs containing the content of all
289 instances of that subfield in that instance of that tag. If no tags
290 are defined as C<multi>, it will be C<undef>.
292 The C<uni> hash holds data for tag/sub mappings which occur only once
293 per instance of a tag (but may occur multiple times in a record due to
294 there being multiple instances of that tag in a record). Keys are
295 subfield codes and values are subfield content.
297 All C<uni> subfields occurring in the map are guaranteed to be
298 defined. Subfields which are mapped but do not occur in a particular
299 datafield will be given a value of '' (the null string) in the current
300 record struct. Contrast this with subfields which are not mapped, which will be
306 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
310 Please report any bugs or feature requests to the above email address.
314 You can find documentation for this module with the perldoc command.
316 perldoc Equinox::Migration::MapDrivenMARCXMLProc
319 =head1 COPYRIGHT & LICENSE
321 Copyright 2009 Equinox, all rights reserved.
323 This program is free software; you can redistribute it and/or modify it
324 under the same terms as Perl itself.
329 1; # End of Equinox::Migration::MapDrivenMARCXMLProc