1 package Equinox::Migration::MapDrivenMARCXMLProc;
7 use Equinox::Migration::SubfieldMapper;
11 Equinox::Migration::MapDrivenMARCXMLProc
19 our $VERSION = '1.000';
26 use Equinox::Migration::MapDrivenMARCXMLProc;
34 Takes two required arguments: C<mapfile> (which will be passed along
35 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
36 and C<marcfile> (the MARC data to be processed).
38 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE,
41 There is an optional third, argument, C<sample>, which specifies a
42 arrayref of datafields to "sample" by reporting on subfields which are
43 found in the data but not in the map.
45 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE,
50 See L</UNMAPPED TAGS> for more info.
55 my ($class, %args) = @_;
57 my $self = bless { mods => { multi => {},
61 data => { recs => undef, # X::T record objects
62 rptr => 0, # next record pointer
63 crec => undef, # parsed record storage
64 stag => undef, # list of tags to sample
65 umap => undef, # unmapped data samples
69 # initialize map and taglist
70 die "Argument 'mapfile' must be specified\n" unless (defined $args{mapfile});
71 my @mods = keys %{$self->{mods}};
72 $self->{map} = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
74 $self->{data}{tags} = $self->{map}->tags;
77 die "Argument 'marcfile' must be specified\n" unless (defined $args{marcfile});
78 if (-r $args{marcfile}) {
79 $self->{twig} = XML::Twig->new;
80 $self->{twig}->parsefile($args{marcfile});
81 my @records = $self->{twig}->root->children;
82 $self->{data}{recs} = \@records;
84 die "Can't open marc file: $!\n";
93 Extracts data from the next record, per the mapping file. Returns a
94 normalized datastructure (see L</format_record> for details) on
95 success; returns 0 otherwise.
97 while (my $rec = $m->parse_record) {
98 # handle extracted record data
106 # get the next record and wipe current parsed record
107 return 0 unless defined $self->{data}{recs}[ $self->{data}{rptr} ];
108 my $record = $self->{data}{recs}[ $self->{data}{rptr} ];
109 $self->{data}{crec} = { egid => undef, bib => undef, tags => undef };
111 my @fields = $record->children;
113 { $self->process_field($f) }
115 # cleanup memory and increment pointer
117 $self->{data}{rptr}++;
119 return $self->{data}{crec};
127 my ($self, $field) = @_;
128 my $map = $self->{map};
129 my $tag = $field->{'att'}->{'tag'};
130 my $crec = $self->{data}{crec};
135 my $sub = $field->first_child('subfield');
136 $crec->{egid} = $sub->text;;
137 } elsif ($map->has($tag)) {
138 push @{$crec->{tags}}, { tag => $tag, uni => undef, multi => undef };
139 my @subs = $field->children('subfield');
141 { $self->process_subs($tag, $sub) }
142 # check map to ensure all declared subs are in
152 my ($self, $tag, $sub) = @_;
153 my $map = $self->{map};
154 my $code = $sub->{'att'}->{'code'};
156 # handle unmapped tag/subs
157 unless ($map->has($tag, $code)) {
158 my $u = $self->{data}{umap};
159 my $s = $self->{data}{stag};
160 return unless (defined $s->{$tag});
162 # set a value, total-seen count and records-seen-in count
163 $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code};
164 $u->{$tag}{$code}{count}++;
165 $u->{$tag}{$code}{rcnt}++ unless ($u->{$tag}{$code}{last} == $self->{data}{rptr});
166 $u->{$tag}{$code}{last} = $self->{data}{rptr};
170 my $dataf = $self->{data}{crec}{tags}[-1];
171 my $field = $map->field($tag, $code);
174 if (defined $map->mods($field)) {
175 if ($map->mods($field) eq 'multi') {
176 my $name = $tag . $code;
177 push @{$dataf->{multi}{$name}}, $sub->text;
181 $dataf->{uni}{$code} = $sub->text;
184 =head1 PARSED RECORDS
187 egid => evergreen_record_id,
189 (tag_id . sub_code)1 => value1,
190 (tag_id . sub_code)2 => value2,
196 multi => { (tag_id . sub_code) => [ val1, val2, ... ] },
197 uni => { code => value, code2 => value2, ... },
203 That is, there is an C<egid> key which points to the Evergreen ID of
204 that record, a C<bib> key which points to a hashref, and a C<tags>
205 key which points to an arrayref.
209 This hashref holds extracted data which should occur once per record
210 (the default assumption is that a tag/subfield pair can occur multiple
211 times per record). The keys are composed of tag id and subfield code,
212 catenated (e.g. 901c). The values are the contents of that subfield of
215 If there are no tags defined as bib-level, C<bib> will be C<undef>.
219 This arrayref holds anonymous hashrefs, one for each instance of each
220 tag which occurs in the map. Each tag hashref holds its own id
221 (e.g. C<998>), and two more hashrefs, C<multi> and C<uni>.
223 The C<multi> hashref holds the extracted data for tag/sub mappings
224 which have the C<multiple> modifier on them. The keys in C<multi> are
225 composed of the tag id and subfield code, catenated
226 (e.g. C<901c>). The values are arrayrefs containing the content of all
227 instances of that subfield in that instance of that tag.
229 The C<uni> hashref holds data for tag/sub mappings which occur only
230 once per instance of a tag (but may occur multiple times in a record
231 due to there being multiple instances of that tag in a record). Keys
232 are subfield codes and values are subfield content.
234 If no tags are defined as C<multi>, it will be C<undef>.
239 sub_code => { value => VALUE, count => COUNT },
240 sub_code2 => { value => VALUE, count => COUNT },
248 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
252 Please report any bugs or feature requests to the above email address.
256 You can find documentation for this module with the perldoc command.
258 perldoc Equinox::Migration::MapDrivenMARCXMLProc
261 =head1 COPYRIGHT & LICENSE
263 Copyright 2009 Equinox, all rights reserved.
265 This program is free software; you can redistribute it and/or modify it
266 under the same terms as Perl itself.
271 1; # End of Equinox::Migration::MapDrivenMARCXMLProc