1 package Equinox::Migration::MapDrivenMARCXMLProc;
7 use Equinox::Migration::SubfieldMapper 1.002;
11 Equinox::Migration::MapDrivenMARCXMLProc
19 our $VERSION = '1.000';
26 use Equinox::Migration::MapDrivenMARCXMLProc;
34 Takes two required arguments: C<mapfile> (which will be passed along
35 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
36 and C<marcfile> (the MARC data to be processed).
38 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE,
41 There is an optional third, argument, C<sample>, which specifies a
42 arrayref of datafields to "sample" by reporting on subfields which are
43 found in the data but not in the map.
45 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE,
50 See L</UNMAPPED TAGS> for more info.
55 my ($class, %args) = @_;
57 my $self = bless { mods => { multi => {},
61 data => { recs => undef, # X::T record objects
62 rptr => 0, # next record pointer
63 crec => undef, # parsed record storage
64 stag => undef, # list of tags to sample
65 umap => undef, # unmapped data samples
69 # initialize map and taglist
70 die "Argument 'mapfile' must be specified\n" unless (defined $args{mapfile});
71 my @mods = keys %{$self->{mods}};
72 $self->{map} = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
74 $self->{data}{tags} = $self->{map}->tags;
77 die "Argument 'marcfile' must be specified\n" unless (defined $args{marcfile});
78 if (-r $args{marcfile}) {
79 $self->{twig} = XML::Twig->new;
80 $self->{twig}->parsefile($args{marcfile});
81 my @records = $self->{twig}->root->children;
82 $self->{data}{recs} = \@records;
84 die "Can't open marc file: $!\n";
93 Extracts data from the next record, per the mapping file. Returns a
94 normalized datastructure (see L</format_record> for details) on
95 success; returns 0 otherwise.
97 while (my $rec = $m->parse_record) {
98 # handle extracted record data
106 # get the next record and wipe current parsed record
107 return 0 unless defined $self->{data}{recs}[ $self->{data}{rptr} ];
108 my $record = $self->{data}{recs}[ $self->{data}{rptr} ];
109 $self->{data}{crec} = { egid => undef, bib => undef, tags => undef };
111 my @fields = $record->children;
113 { $self->process_field($f) }
115 # cleanup memory and increment pointer
117 $self->{data}{rptr}++;
119 return $self->{data}{crec};
123 my ($self, $field) = @_;
124 my $map = $self->{map};
125 my $tag = $field->{'att'}->{'tag'};
126 my $crec = $self->{data}{crec};
129 unless (defined $tag) {
136 my $sub = $field->first_child('subfield');
137 $crec->{egid} = $sub->text;
140 if ($map->has($tag)) {
141 push @{$crec->{tags}}, { tag => $tag, uni => undef, multi => undef };
142 my @subs = $field->children('subfield');
144 { $self->process_subs($tag, $sub) }
145 # check map to ensure all declared subs have a value
146 my $mods = $map->mods($field);
147 for my $mappedsub ( @{ $map->subfields($tag) } ) {
148 next if $mods->{multi};
149 $crec->{tags}[-1]{uni}{$mappedsub} = ''
150 unless defined $crec->{tags}[-1]{uni}{$mappedsub};
156 my ($self, $tag, $sub) = @_;
157 my $map = $self->{map};
158 my $code = $sub->{'att'}->{'code'};
160 # handle unmapped tag/subs
161 unless ($map->has($tag, $code)) {
162 my $u = $self->{data}{umap};
163 my $s = $self->{data}{stag};
164 return unless (defined $s->{$tag});
166 # set a value, total-seen count and records-seen-in count
167 $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code};
168 $u->{$tag}{$code}{count}++;
169 $u->{$tag}{$code}{rcnt}++ unless ($u->{$tag}{$code}{last} == $self->{data}{rptr});
170 $u->{$tag}{$code}{last} = $self->{data}{rptr};
174 # fetch our datafield struct and fieldname
175 my $dataf = $self->{data}{crec}{tags}[-1];
176 my $field = $map->field($tag, $code);
178 # handle modifiers, or slug data in normally
179 if (my $mods = $map->mods($field)) {
180 if ($mods->{multi}) {
181 my $name = $tag . $code;
182 push @{$dataf->{multi}{$name}}, $sub->text;
185 $dataf->{uni}{$code} = $sub->text;
189 =head1 PARSED RECORDS
193 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new(ARGUMENTS);
194 $rec = $m->parse_record;
196 Then C<$rec> will look like:
199 egid => evergreen_record_id,
201 (tag_id . sub_code)1 => value1,
202 (tag_id . sub_code)2 => value2,
208 multi => { (tag_id . sub_code) => [ val1, val2, ... ] },
209 uni => { code => value, code2 => value2, ... },
215 That is, there is an C<egid> key which points to the Evergreen ID of
216 that record, a C<bib> key which points to a hashref, and a C<tags>
217 key which points to an arrayref.
221 A reference to a hash which holds extracted data which occurs only
222 once per record (and is therefore "bib-level"; the default assumption
223 is that a tag/subfield pair can occur multiple times per record). The
224 keys are composed of tag id and subfield code, catenated
225 (e.g. 901c). The values are the contents of that subfield of that tag.
227 If there are no tags defined as bib-level in the mapfile, C<bib> will
232 A reference to a list of anonymous hashes, one for each instance of
233 each tag which occurs in the map.
235 Each tag hash holds its own id (e.g. C<998>), and two references to
236 two more hashrefs, C<multi> and C<uni>.
238 The C<multi> hash holds the extracted data for tag/sub mappings which
239 have the C<multiple> modifier on them. The keys in C<multi> are
240 composed of the tag id and subfield code, catenated
241 (e.g. C<901c>). The values are arrayrefs containing the content of all
242 instances of that subfield in that instance of that tag. If no tags
243 are defined as C<multi>, it will be C<undef>.
245 The C<uni> hash holds data for tag/sub mappings which occur only once
246 per instance of a tag (but may occur multiple times in a record due to
247 there being multiple instances of that tag in a record). Keys are
248 subfield codes and values are subfield content.
250 All C<uni> subfields occuring in the map are guaranteed to be
251 defined. Sufields which are mapped but do not occur in a particular
252 datafield will be given a value of '' (the null string) in the current
253 record struct. Oppose subfields which are not mapped, which will be
259 sub_code => { value => VALUE, count => COUNT },
260 sub_code2 => { value => VALUE, count => COUNT },
268 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
272 Please report any bugs or feature requests to the above email address.
276 You can find documentation for this module with the perldoc command.
278 perldoc Equinox::Migration::MapDrivenMARCXMLProc
281 =head1 COPYRIGHT & LICENSE
283 Copyright 2009 Equinox, all rights reserved.
285 This program is free software; you can redistribute it and/or modify it
286 under the same terms as Perl itself.
291 1; # End of Equinox::Migration::MapDrivenMARCXMLProc