1 package Equinox::Migration::MapDrivenMARCXMLProc;
7 use Equinox::Migration::SubfieldMapper 1.002;
11 Equinox::Migration::MapDrivenMARCXMLProc
19 our $VERSION = '1.000';
26 use Equinox::Migration::MapDrivenMARCXMLProc;
34 Takes two required arguments: C<mapfile> (which will be passed along
35 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
36 and C<marcfile> (the MARC data to be processed).
38 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE,
41 There is an optional third, argument, C<sample>, which specifies a
42 arrayref of datafields to "sample" by reporting on subfields which are
43 found in the data but not in the map.
45 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE,
50 See L</UNMAPPED TAGS> for more info.
55 my ($class, %args) = @_;
57 my $self = bless { mods => { multi => {},
61 data => { recs => undef, # X::T record objects
62 rptr => 0, # next record pointer
63 crec => undef, # parsed record storage
64 stag => undef, # list of tags to sample
65 umap => undef, # unmapped data samples
69 # initialize map and taglist
70 die "Argument 'mapfile' must be specified\n" unless (defined $args{mapfile});
71 my @mods = keys %{$self->{mods}};
72 $self->{map} = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
74 $self->{data}{tags} = $self->{map}->tags;
77 die "Argument 'marcfile' must be specified\n" unless (defined $args{marcfile});
78 if (-r $args{marcfile}) {
79 $self->{twig} = XML::Twig->new;
80 $self->{twig}->parsefile($args{marcfile});
81 my @records = $self->{twig}->root->children;
82 $self->{data}{recs} = \@records;
84 die "Can't open marc file: $!\n";
87 # if we have a sample arg, set up the sample set and umap hash
88 if (defined $args{sample}) {
89 for my $s ( @{$args{sample}})
90 { $self->{data}{stag}{$s} = 1 }
91 $self->{data}{umap} = {};
100 Extracts data from the next record, per the mapping file. Returns a
101 normalized datastructure (see L</format_record> for details) on
102 success; returns 0 otherwise.
104 while (my $rec = $m->parse_record) {
105 # handle extracted record data
113 # get the next record and wipe current parsed record
114 return 0 unless defined $self->{data}{recs}[ $self->{data}{rptr} ];
115 my $record = $self->{data}{recs}[ $self->{data}{rptr} ];
116 $self->{data}{crec} = { egid => undef, bib => undef, tags => undef };
118 my @fields = $record->children;
120 { $self->process_field($f) }
122 # cleanup memory and increment pointer
124 $self->{data}{rptr}++;
126 return $self->{data}{crec};
130 my ($self, $field) = @_;
131 my $map = $self->{map};
132 my $tag = $field->{'att'}->{'tag'};
133 my $crec = $self->{data}{crec};
136 unless (defined $tag) {
143 my $sub = $field->first_child('subfield');
144 $crec->{egid} = $sub->text;
147 if ($map->has($tag)) {
148 push @{$crec->{tags}}, { tag => $tag, uni => undef, multi => undef };
149 my @subs = $field->children('subfield');
151 { $self->process_subs($tag, $sub) }
152 # check map to ensure all declared subs have a value
153 my $mods = $map->mods($field);
154 for my $mappedsub ( @{ $map->subfields($tag) } ) {
155 next if $mods->{multi};
156 $crec->{tags}[-1]{uni}{$mappedsub} = ''
157 unless defined $crec->{tags}[-1]{uni}{$mappedsub};
163 my ($self, $tag, $sub) = @_;
164 my $map = $self->{map};
165 my $code = $sub->{'att'}->{'code'};
167 # handle unmapped tag/subs
168 unless ($map->has($tag, $code)) {
169 my $u = $self->{data}{umap};
170 my $s = $self->{data}{stag};
171 return unless (defined $s->{$tag});
173 # set a value, total-seen count and records-seen-in count
174 $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code};
175 $u->{$tag}{$code}{count}++;
176 $u->{$tag}{$code}{rcnt}++ unless ( defined $u->{$tag}{$code}{last} and
177 $u->{$tag}{$code}{last} == $self->{data}{rptr} );
178 $u->{$tag}{$code}{last} = $self->{data}{rptr};
182 # fetch our datafield struct and fieldname
183 my $dataf = $self->{data}{crec}{tags}[-1];
184 my $field = $map->field($tag, $code);
186 # handle modifiers, or slug data in normally
187 if (my $mods = $map->mods($field)) {
188 if ($mods->{multi}) {
189 my $name = $tag . $code;
190 push @{$dataf->{multi}{$name}}, $sub->text;
193 $dataf->{uni}{$code} = $sub->text;
197 =head1 PARSED RECORDS
201 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new(ARGUMENTS);
202 $rec = $m->parse_record;
204 Then C<$rec> will look like:
207 egid => evergreen_record_id,
209 (tag_id . sub_code)1 => value1,
210 (tag_id . sub_code)2 => value2,
216 multi => { (tag_id . sub_code) => [ val1, val2, ... ] },
217 uni => { code => value, code2 => value2, ... },
223 That is, there is an C<egid> key which points to the Evergreen ID of
224 that record, a C<bib> key which points to a hashref, and a C<tags>
225 key which points to an arrayref.
229 A reference to a hash which holds extracted data which occurs only
230 once per record (and is therefore "bib-level"; the default assumption
231 is that a tag/subfield pair can occur multiple times per record). The
232 keys are composed of tag id and subfield code, catenated
233 (e.g. 901c). The values are the contents of that subfield of that tag.
235 If there are no tags defined as bib-level in the mapfile, C<bib> will
240 A reference to a list of anonymous hashes, one for each instance of
241 each tag which occurs in the map.
243 Each tag hash holds its own id (e.g. C<998>), and two references to
244 two more hashrefs, C<multi> and C<uni>.
246 The C<multi> hash holds the extracted data for tag/sub mappings which
247 have the C<multiple> modifier on them. The keys in C<multi> are
248 composed of the tag id and subfield code, catenated
249 (e.g. C<901c>). The values are arrayrefs containing the content of all
250 instances of that subfield in that instance of that tag. If no tags
251 are defined as C<multi>, it will be C<undef>.
253 The C<uni> hash holds data for tag/sub mappings which occur only once
254 per instance of a tag (but may occur multiple times in a record due to
255 there being multiple instances of that tag in a record). Keys are
256 subfield codes and values are subfield content.
258 All C<uni> subfields occuring in the map are guaranteed to be
259 defined. Sufields which are mapped but do not occur in a particular
260 datafield will be given a value of '' (the null string) in the current
261 record struct. Oppose subfields which are not mapped, which will be
266 If the C<sample> argument is passed to L</new>, there will also be a
267 structure which holds data about unmapped subfields encountered in
268 mapped tags which are also in the declared sample set. This
269 information is collected over the life of the object and is not reset
270 for every record processed (as the current record data neccessarily
274 sub_code => { value => VALUE,
283 For each mapped tag, for each unmapped subfield, there is a hash of
284 data about that subfield containing
286 * value - A sample of the subfield text
287 * count - Total number of times the subfield was seen
288 * rcnt - The number of records the subfield was seen in
292 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
296 Please report any bugs or feature requests to the above email address.
300 You can find documentation for this module with the perldoc command.
302 perldoc Equinox::Migration::MapDrivenMARCXMLProc
305 =head1 COPYRIGHT & LICENSE
307 Copyright 2009 Equinox, all rights reserved.
309 This program is free software; you can redistribute it and/or modify it
310 under the same terms as Perl itself.
315 1; # End of Equinox::Migration::MapDrivenMARCXMLProc