1 package Equinox::Migration::MapDrivenMARCXMLProc;
7 use Equinox::Migration::SubfieldMapper;
11 Equinox::Migration::MapDrivenMARCXMLProc
19 our $VERSION = '1.000';
26 use Equinox::Migration::MapDrivenMARCXMLProc;
34 Takes two required arguments: C<mapfile> (which will be passed along
35 to L<Equinox::Migration::SubfieldMapper> as the basis for its map),
36 and C<marcfile> (the MARC data to be processed).
38 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE,
41 There is an optional third, argument, C<sample>, which specifies a
42 arrayref of datafields to "sample" by reporting on subfields which are
43 found in the data but not in the map.
45 my $m = Equinox::Migration::MapDrivenMARCXMLProc->new( mapfile => FILE,
50 See L</UNMAPPED TAGS> for more info.
55 my ($class, %args) = @_;
57 my $self = bless { mods => { multi => {},
61 data => { recs => undef, # X::T record objects
62 rptr => 0, # next record pointer
63 crec => undef, # parsed record storage
64 stag => undef, # list of tags to sample
65 umap => undef, # unmapped data samples
69 # initialize map and taglist
70 die "Argument 'mapfile' must be specified\n" unless (defined $args{mapfile});
71 my @mods = keys %{$self->{mods}};
72 $self->{map} = Equinox::Migration::SubfieldMapper->new( file => $args{mapfile},
74 $self->{data}{tags} = $self->{map}->tags;
77 die "Argument 'marcfile' must be specified\n" unless (defined $args{marcfile});
78 if (-r $args{marcfile}) {
79 $self->{twig} = XML::Twig->new;
80 $self->{twig}->parsefile($args{marcfile});
81 my @records = $self->{twig}->root->children;
82 $self->{data}{recs} = \@records;
84 die "Can't open marc file: $!\n";
93 Extracts data from the next record, per the mapping file. Returns a
94 normalized datastructure (see L</format_record> for details) on
95 success; returns 0 otherwise.
97 while (my $rec = $m->parse_record) {
98 # handle extracted record data
106 # get the next record and wipe current parsed record
107 return 0 unless defined $self->{data}{recs}[ $self->{data}{rptr} ];
108 my $record = $self->{data}{recs}[ $self->{data}{rptr} ];
109 $self->{data}{crec} = { bib => undef, multi => undef };
111 my @fields = $record->children;
113 { $self->process_field($f) }
115 # cleanup memory and increment pointer
117 $self->{data}{rptr}++;
119 return $self->format_record;
127 my ($self, $field) = @_;
128 my $map = $self->{map};
129 my $tag = $field->{'att'}->{'tag'};
130 my $parsed = $self->{data}{crec};
135 my $sub = $field->first_child('subfield');
136 $parsed->{egid} = $sub->text;;
137 } elsif ($map->has($tag)) {
138 push @{$parsed->{tags}}, { tag => $tag };
139 my @subs = $field->children('subfield');
141 { $self->process_subs($tag, $sub) }
151 my ($self, $tag, $sub) = @_;
152 my $map = $self->{map};
153 my $code = $sub->{'att'}->{'code'};
155 # handle unmapped tag/subs
156 unless ($map->has($tag, $code)) {
157 my $u = $self->{data}{umap};
158 my $s = $self->{data}{stag};
159 return unless (defined $s->{$tag});
161 $u->{$tag}{$code}{value} = $sub->text unless defined $u->{$tag}{$code};
162 $u->{$tag}{$code}{count}++;
166 my $data = $self->{data}{crec}{tags}[-1];
167 my $field = $map->field($tag, $code);
170 if (defined $map->mods($field)) {
171 if ($map->mods($field) eq 'multi') {
172 my $name = $tag . $code;
173 push @{$data->{multi}{$name}}, $sub->text;
177 $data->{uni}{$code} = $sub->text;
180 =head1 PARSED RECORDS
183 egid => evergreen_record_id,
185 (tag_id . sub_code)1 => value1,
186 (tag_id . sub_code)2 => value2,
192 multi => { (tag_id . sub_code) => [ val1, val2, ... ] },
193 uni => { code => value, code2 => value2, ... },
199 That is, there is an C<egid> key which points to the Evergreen ID of
200 that record, a C<bib> key which points to a hashref, and a C<tags>
201 key which points to an arrayref.
205 This hashref holds extracted data which should occur once per record
206 (the default assumption is that a tag/subfield pair can occur multiple
207 times per record). The keys are composed of tag id and subfield code,
208 catenated (e.g. 901c). The values are the contents of that subfield of
211 If there are no tags defined as bib-level, C<bib> will be C<undef>.
215 This arrayref holds anonymous hashrefs, one for each instance of each
216 tag which occurs in the map. Each tag hashref holds its own id
217 (e.g. C<998>), and two more hashrefs, C<multi> and C<uni>.
219 The C<multi> hashref holds the extracted data for tag/sub mappings
220 which have the C<multiple> modifier on them. The keys in C<multi> are
221 composed of the tag id and subfield code, catenated
222 (e.g. C<901c>). The values are arrayrefs containing the content of all
223 instances of that subfield in that instance of that tag.
225 The C<uni> hashref holds data for tag/sub mappings which occur only
226 once per instance of a tag (but may occur multiple times in a record
227 due to there being multiple instances of that tag in a record). Keys
228 are subfield codes and values are subfield content.
230 If no tags are defined as C<multi>, it will be C<undef>.
235 sub_code => { value => VALUE, count => COUNT },
236 sub_code2 => { value => VALUE, count => COUNT },
244 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
248 Please report any bugs or feature requests to the above email address.
252 You can find documentation for this module with the perldoc command.
254 perldoc Equinox::Migration::MapDrivenMARCXMLProc
257 =head1 COPYRIGHT & LICENSE
259 Copyright 2009 Equinox, all rights reserved.
261 This program is free software; you can redistribute it and/or modify it
262 under the same terms as Perl itself.
267 1; # End of Equinox::Migration::MapDrivenMARCXMLProc