1 package Equinox::Migration::MapDrivenMARCXMLProc;
7 use Equinox::Migration::SubfieldMapper 1.002;
11 # sample functionality should be extracted into a new module which
12 # uses E::M::SM to drive sampling of individual datafields, and
13 # reports ALL datafields which occur
15 # --sample should give the list of all datafields
16 # --samplefile should take a SM map as teh argument and introspect the mapped datafields
21 Equinox::Migration::MapDrivenMARCXMLProc
29 our $VERSION = '1.000';
36 use Equinox::Migration::MapDrivenMARCXMLProc;
44 Takes two required arguments: C<mapfile> (which will be passed along
45 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
46 and C<marcfile> (the MARC data to be processed).
48 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE,
51 There is an optional third, argument, C<sample>, which specifies a
52 arrayref of datafields to "sample" by reporting on subfields which are
53 found in the data but not in the map.
55 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE,
60 See L</UNMAPPED TAGS> for more info.
65 my ($class, %args) = @_;
67 my $self = bless { mods => { multi => {},
71 data => { recs => undef, # X::T record objects
72 rptr => 0, # next record pointer
73 crec => undef, # parsed record storage
74 stag => undef, # list of tags to sample
75 umap => undef, # unmapped data samples
79 # initialize map and taglist
80 die "Argument 'mapfile' must be specified\n" unless (defined $args{mapfile});
81 my @mods = keys %{$self->{mods}};
82 $self->{map} = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
84 $self->{data}{tags} = $self->{map}->tags;
87 die "Argument 'marcfile' must be specified\n" unless (defined $args{marcfile});
88 if (-r $args{marcfile}) {
89 $self->{twig} = XML::Twig->new;
90 $self->{twig}->parsefile($args{marcfile});
91 my @records = $self->{twig}->root->children;
92 $self->{data}{recs} = \@records;
94 die "Can't open marc file: $!\n";
97 # if we have a sample arg, set up the sample set and umap hash
98 if (defined $args{sample}) {
99 for my $s ( @{$args{sample}})
100 { $self->{data}{stag}{$s} = 1 }
101 $self->{data}{umap} = {};
110 Extracts data from the next record, per the mapping file. Returns a
111 normalized datastructure (see L</format_record> for details) on
112 success; returns 0 otherwise.
114 while (my $rec = $m->parse_record) {
115 # handle extracted record data
123 # get the next record and wipe current parsed record
124 return 0 unless defined $self->{data}{recs}[ $self->{data}{rptr} ];
125 my $record = $self->{data}{recs}[ $self->{data}{rptr} ];
126 $self->{data}{crec} = { egid => undef, bib => undef, tags => undef };
128 my @fields = $record->children;
130 { $self->process_field($f) }
132 # cleanup memory and increment pointer
134 $self->{data}{rptr}++;
136 return $self->{data}{crec};
140 my ($self, $field) = @_;
141 my $map = $self->{map};
142 my $tag = $field->{'att'}->{'tag'};
143 my $crec = $self->{data}{crec};
146 unless (defined $tag) {
153 my $sub = $field->first_child('subfield');
154 $crec->{egid} = $sub->text;
157 if ($map->has($tag)) {
158 push @{$crec->{tags}}, { tag => $tag, uni => undef, multi => undef };
159 my @subs = $field->children('subfield');
161 { $self->process_subs($tag, $sub) }
162 # check map to ensure all declared subs have a value
163 my $mods = $map->mods($field);
164 for my $mappedsub ( @{ $map->subfields($tag) } ) {
165 next if $mods->{multi};
166 $crec->{tags}[-1]{uni}{$mappedsub} = ''
167 unless defined $crec->{tags}[-1]{uni}{$mappedsub};
173 my ($self, $tag, $sub) = @_;
174 my $map = $self->{map};
175 my $code = $sub->{'att'}->{'code'};
177 # handle unmapped tag/subs
178 unless ($map->has($tag, $code)) {
179 my $u = $self->{data}{umap};
180 my $s = $self->{data}{stag};
181 return unless (defined $s->{$tag});
183 # set a value, total-seen count and records-seen-in count
184 $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code};
185 $u->{$tag}{$code}{count}++;
186 $u->{$tag}{$code}{rcnt}++ unless ( defined $u->{$tag}{$code}{last} and
187 $u->{$tag}{$code}{last} == $self->{data}{rptr} );
188 $u->{$tag}{$code}{last} = $self->{data}{rptr};
192 # fetch our datafield struct and fieldname
193 my $dataf = $self->{data}{crec}{tags}[-1];
194 my $field = $map->field($tag, $code);
196 # handle modifiers, or slug data in normally
197 if (my $mods = $map->mods($field)) {
198 if ($mods->{multi}) {
199 my $name = $tag . $code;
200 push @{$dataf->{multi}{$name}}, $sub->text;
203 die "Multiple occurances of a non-multi field: \n"
204 if (defined $dataf->{uni}{$code});
205 $dataf->{uni}{$code} = $sub->text;
211 MapDrivenMARCXMLProc implements the following modifiers, and passes
212 them to L<Equinox::Migration::SubfieldMapper>, meaning that specifying
213 any other modifiers in a MDMP map file will cause a fatal error when
218 If a mapping is declared to be C<multi>, then MDMP expects to see more
219 than one instance of that subfield per datafield, and the data is
220 handled accordingly (see L</PARSED RECORDS> below).
222 Occurring zero or one time is legal for a C<multi> mapping.
224 A mapping which is not flagged as C<multi>, but which occurs more than
225 once per datafield will cause a fatal error.
231 =head1 PARSED RECORDS
235 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new(ARGUMENTS);
236 $rec = $m->parse_record;
238 Then C<$rec> will look like:
241 egid => evergreen_record_id,
243 (tag_id . sub_code)1 => value1,
244 (tag_id . sub_code)2 => value2,
250 multi => { (tag_id . sub_code) => [ val1, val2, ... ] },
251 uni => { code => value, code2 => value2, ... },
257 That is, there is an C<egid> key which points to the Evergreen ID of
258 that record, a C<bib> key which points to a hashref, and a C<tags>
259 key which points to an arrayref.
263 A reference to a hash which holds extracted data which occurs only
264 once per record (and is therefore "bib-level"; the default assumption
265 is that a tag/subfield pair can occur multiple times per record). The
266 keys are composed of tag id and subfield code, catenated
267 (e.g. 901c). The values are the contents of that subfield of that tag.
269 If there are no tags defined as bib-level in the mapfile, C<bib> will
274 A reference to a list of anonymous hashes, one for each instance of
275 each tag which occurs in the map.
277 Each tag hash holds its own id (e.g. C<998>), and two references to
278 two more hashrefs, C<multi> and C<uni>.
280 The C<multi> hash holds the extracted data for tag/sub mappings which
281 have the C<multiple> modifier on them. The keys in C<multi> are
282 composed of the tag id and subfield code, catenated
283 (e.g. C<901c>). The values are arrayrefs containing the content of all
284 instances of that subfield in that instance of that tag. If no tags
285 are defined as C<multi>, it will be C<undef>.
287 The C<uni> hash holds data for tag/sub mappings which occur only once
288 per instance of a tag (but may occur multiple times in a record due to
289 there being multiple instances of that tag in a record). Keys are
290 subfield codes and values are subfield content.
292 All C<uni> subfields occuring in the map are guaranteed to be
293 defined. Sufields which are mapped but do not occur in a particular
294 datafield will be given a value of '' (the null string) in the current
295 record struct. Oppose subfields which are not mapped, which will be
300 If the C<sample> argument is passed to L</new>, there will also be a
301 structure which holds data about unmapped subfields encountered in
302 mapped tags which are also in the declared sample set. This
303 information is collected over the life of the object and is not reset
304 for every record processed (as the current record data neccessarily
308 sub_code => { value => VALUE,
317 For each mapped tag, for each unmapped subfield, there is a hash of
318 data about that subfield containing
320 * value - A sample of the subfield text
321 * count - Total number of times the subfield was seen
322 * rcnt - The number of records the subfield was seen in
326 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
330 Please report any bugs or feature requests to the above email address.
334 You can find documentation for this module with the perldoc command.
336 perldoc Equinox::Migration::MapDrivenMARCXMLProc
339 =head1 COPYRIGHT & LICENSE
341 Copyright 2009 Equinox, all rights reserved.
343 This program is free software; you can redistribute it and/or modify it
344 under the same terms as Perl itself.
349 1; # End of Equinox::Migration::MapDrivenMARCXMLProc