1 package Equinox::Migration::MapDrivenMARCXMLProc;
7 use Equinox::Migration::SubfieldMapper 1.003;
11 # sample functionality should be extracted into a new module which
12 # uses E::M::SM to drive sampling of individual datafields, and
13 # reports ALL datafields which occur
15 # --sample should give the list of all datafields
16 # --samplefile should take a SM map as teh argument and introspect the mapped datafields
21 Equinox::Migration::MapDrivenMARCXMLProc
29 our $VERSION = '1.000';
36 use Equinox::Migration::MapDrivenMARCXMLProc;
44 Takes two required arguments: C<mapfile> (which will be passed along
45 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
46 and C<marcfile> (the MARC data to be processed).
48 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE,
51 There is an optional third, argument, C<sample>, which specifies a
52 arrayref of datafields to "sample" by reporting on subfields which are
53 found in the data but not in the map.
55 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE,
60 See L</UNMAPPED TAGS> for more info.
65 my ($class, %args) = @_;
67 my $self = bless { mods => { multi => {},
71 data => { recs => undef, # X::T record objects
72 rptr => 0, # next record pointer
73 crec => undef, # parsed record storage
74 stag => undef, # list of tags to sample
75 umap => undef, # unmapped data samples
79 # initialize map and taglist
80 die "Argument 'mapfile' must be specified\n" unless (defined $args{mapfile});
81 my @mods = keys %{$self->{mods}};
82 $self->{map} = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
84 $self->{data}{tags} = $self->{map}->tags;
87 die "Argument 'marcfile' must be specified\n" unless (defined $args{marcfile});
88 if (-r $args{marcfile}) {
89 $self->{twig} = XML::Twig->new;
90 $self->{twig}->parsefile($args{marcfile});
91 my @records = $self->{twig}->root->children;
92 $self->{data}{recs} = \@records;
94 die "Can't open marc file: $!\n";
97 # if we have a sample arg, set up the sample set and umap hash
98 if (defined $args{sample}) {
99 for my $s ( @{$args{sample}})
100 { $self->{data}{stag}{$s} = 1 }
101 $self->{data}{umap} = {};
110 Extracts data from the next record, per the mapping file. Returns a
111 normalized datastructure (see L</format_record> for details) on
112 success; returns 0 otherwise.
114 while (my $rec = $m->parse_record) {
115 # handle extracted record data
123 # get the next record and wipe current parsed record
124 return 0 unless defined $self->{data}{recs}[ $self->{data}{rptr} ];
125 my $record = $self->{data}{recs}[ $self->{data}{rptr} ];
126 $self->{data}{crec} = { egid => undef, bib => undef, tags => undef };
128 my @fields = $record->children;
130 { $self->process_field($f) }
132 # cleanup memory and increment pointer
134 $self->{data}{rptr}++;
136 # check for required fields
137 $self->check_required;
139 return $self->{data}{crec};
143 my ($self, $field) = @_;
144 my $map = $self->{map};
145 my $tag = $field->{'att'}->{'tag'};
146 my $crec = $self->{data}{crec};
149 unless (defined $tag) {
156 my $sub = $field->first_child('subfield');
157 $crec->{egid} = $sub->text;
160 if ($map->has($tag)) {
161 push @{$crec->{tags}}, { tag => $tag, uni => undef, multi => undef };
162 my @subs = $field->children('subfield');
164 { $self->process_subs($tag, $sub) }
165 # check map to ensure all declared subs have a value
166 my $mods = $map->mods($field);
167 for my $mappedsub ( @{ $map->subfields($tag) } ) {
168 next if $mods->{multi};
169 $crec->{tags}[-1]{uni}{$mappedsub} = ''
170 unless defined $crec->{tags}[-1]{uni}{$mappedsub};
176 my ($self, $tag, $sub) = @_;
177 my $map = $self->{map};
178 my $code = $sub->{'att'}->{'code'};
180 # handle unmapped tag/subs
181 unless ($map->has($tag, $code)) {
182 my $u = $self->{data}{umap};
183 my $s = $self->{data}{stag};
184 return unless (defined $s->{$tag});
186 # set a value, total-seen count and records-seen-in count
187 $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code};
188 $u->{$tag}{$code}{count}++;
189 $u->{$tag}{$code}{rcnt}++ unless ( defined $u->{$tag}{$code}{last} and
190 $u->{$tag}{$code}{last} == $self->{data}{rptr} );
191 $u->{$tag}{$code}{last} = $self->{data}{rptr};
195 # fetch our datafield struct and fieldname
196 my $dataf = $self->{data}{crec}{tags}[-1];
197 my $field = $map->field($tag, $code);
200 if (my $mods = $map->mods($field)) {
201 if ($mods->{multi}) {
202 my $name = $tag . $code;
203 push @{$dataf->{multi}{$name}}, $sub->text;
208 die "Multiple occurances of a non-multi field: $tag$code at rec ",($self->{data}{rptr} + 1),"\n"
209 if (defined $dataf->{uni}{$code});
210 $dataf->{uni}{$code} = $sub->text;
216 my $mods = $self->{map}->mods;
217 my $crec = $self->{data}{crec};
219 for my $tag_id (keys %{$mods->{required}}) {
220 for my $code (@{$mods->{required}{$tag_id}}) {
223 $found = 1 if ($crec->{bib}{($tag_id . $code)});
224 for my $tag (@{$crec->{tags}}) {
225 $found = 1 if ($tag->{multi}{($tag_id . $code)});
226 $found = 1 if ($tag->{uni}{$code});
229 die "Required mapping $tag_id$code not found in rec ",$self->{data}{rptr},"\n"
238 MapDrivenMARCXMLProc implements the following modifiers, and passes
239 them to L<Equinox::Migration::SubfieldMapper>, meaning that specifying
240 any other modifiers in a MDMP map file will cause a fatal error when
245 If a mapping is declared to be C<multi>, then MDMP expects to see more
246 than one instance of that subfield per datafield, and the data is
247 handled accordingly (see L</PARSED RECORDS> below).
249 Occurring zero or one time is legal for a C<multi> mapping.
251 A mapping which is not flagged as C<multi>, but which occurs more than
252 once per datafield will cause a fatal error.
256 The C<bib> modifier declares that a mapping is "bib-level", and should
257 be encountered once per B<record> instead of once per B<datafield> --
258 which is another way of saying that it occurs in a non-repeating
259 datafield or in a controlfield.
263 By default, if a mapping does not occur in a datafield (or record, in
264 the case of C<bib> mappings), processing continues normally. if a
265 mapping has the C<required> modifier, however, it must appear, or a
266 fatal error will occur.
268 =head1 PARSED RECORDS
272 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new(ARGUMENTS);
273 $rec = $m->parse_record;
275 Then C<$rec> will look like:
278 egid => evergreen_record_id,
280 (tag_id . sub_code)1 => value1,
281 (tag_id . sub_code)2 => value2,
287 multi => { (tag_id . sub_code) => [ val1, val2, ... ] },
288 uni => { code => value, code2 => value2, ... },
294 That is, there is an C<egid> key which points to the Evergreen ID of
295 that record, a C<bib> key which points to a hashref, and a C<tags>
296 key which points to an arrayref.
300 A reference to a hash which holds extracted data which occurs only
301 once per record (and is therefore "bib-level"; the default assumption
302 is that a tag/subfield pair can occur multiple times per record). The
303 keys are composed of tag id and subfield code, catenated
304 (e.g. 901c). The values are the contents of that subfield of that tag.
306 If there are no tags defined as bib-level in the mapfile, C<bib> will
311 A reference to a list of anonymous hashes, one for each instance of
312 each tag which occurs in the map.
314 Each tag hash holds its own id (e.g. C<998>), and two references to
315 two more hashrefs, C<multi> and C<uni>.
317 The C<multi> hash holds the extracted data for tag/sub mappings which
318 have the C<multiple> modifier on them. The keys in C<multi> are
319 composed of the tag id and subfield code, catenated
320 (e.g. C<901c>). The values are arrayrefs containing the content of all
321 instances of that subfield in that instance of that tag. If no tags
322 are defined as C<multi>, it will be C<undef>.
324 The C<uni> hash holds data for tag/sub mappings which occur only once
325 per instance of a tag (but may occur multiple times in a record due to
326 there being multiple instances of that tag in a record). Keys are
327 subfield codes and values are subfield content.
329 All C<uni> subfields occuring in the map are guaranteed to be
330 defined. Sufields which are mapped but do not occur in a particular
331 datafield will be given a value of '' (the null string) in the current
332 record struct. Oppose subfields which are not mapped, which will be
337 If the C<sample> argument is passed to L</new>, there will also be a
338 structure which holds data about unmapped subfields encountered in
339 mapped tags which are also in the declared sample set. This
340 information is collected over the life of the object and is not reset
341 for every record processed (as the current record data neccessarily
345 sub_code => { value => VALUE,
354 For each mapped tag, for each unmapped subfield, there is a hash of
355 data about that subfield containing
357 * value - A sample of the subfield text
358 * count - Total number of times the subfield was seen
359 * rcnt - The number of records the subfield was seen in
363 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
367 Please report any bugs or feature requests to the above email address.
371 You can find documentation for this module with the perldoc command.
373 perldoc Equinox::Migration::MapDrivenMARCXMLProc
376 =head1 COPYRIGHT & LICENSE
378 Copyright 2009 Equinox, all rights reserved.
380 This program is free software; you can redistribute it and/or modify it
381 under the same terms as Perl itself.
386 1; # End of Equinox::Migration::MapDrivenMARCXMLProc