1 package Equinox::Migration::MARCXMLSampler;
7 use Equinox::Migration::SimpleTagList 1.001;
11 # sample functionality should be extracted into a new module which
12 # uses E::M::SM to drive sampling of individual datafields, and
13 # reports ALL datafields which occur
15 # --sample should give the list of all datafields
16 # --samplefile should take an SM map as the argument and introspect the mapped datafields
21 Equinox::Migration::MARCXMLSampler
29 our $VERSION = '1.000';
36 use Equinox::Migration::MARCXMLSampler;
47 my ($class, %args) = @_;
49 my $self = bless { data => { recs => undef, # X::T record objects
50 rcnt => 0, # next record counter
51 samp => {}, # data samples
52 tags => {}, # all found tags
57 die "Argument 'marcfile' must be specified\n" unless ($args{marcfile});
58 if (-r $args{marcfile}) {
59 $self->{twig} = XML::Twig->new;
60 $self->{twig}->parsefile($args{marcfile});
61 my @records = $self->{twig}->root->children;
62 $self->{data}{recs} = \@records;
64 die "Can't open marc file: $!\n";
67 # if a mapfile or mapstring argument was given, create the sample map
68 $self->{map} = Equinox::Migration::SimpleTagList->new(file => $args{mapfile})
70 $self->{map} = Equinox::Migration::SimpleTagList->new(str => $args{mapstring})
71 if ($args{mapstring});
79 Extracts data from MARC records, per the mapping file.
86 for my $record ( @{$self->{data}{recs}} ) {
87 my @fields = $record->children;
89 { $self->process_field($f) }
91 # cleanup memory and increment pointer
93 $self->{data}{rcnt}++;
98 my ($self, $field) = @_;
99 my $map = $self->{map};
100 my $tag = $field->{'att'}->{'tag'};
101 return unless ($tag and $tag > 9);
103 # increment raw tag count
104 $self->{data}{tags}{$tag}++;
106 if ($map and $map->has($tag)) {
107 my @subs = $field->children('subfield');
109 { $self->process_subs($tag, $sub) }
114 my ($self, $tag, $sub) = @_;
115 my $map = $self->{map};
116 my $code = $sub->{'att'}->{'code'};
118 # handle unmapped tag/subs
119 my $samp = $self->{data}{samp};
120 # set a value, total-seen count and records-seen-in count
121 $samp->{$tag}{$code}{value} = $sub->text unless defined $samp->{$tag}{$code};
122 $samp->{$tag}{$code}{count}++;
123 $samp->{$tag}{$code}{rcnt}++ unless ( defined $samp->{$tag}{$code}{last} and
124 $samp->{$tag}{$code}{last} == $self->{data}{rcnt} );
125 $samp->{$tag}{$code}{last} = $self->{data}{rcnt};
131 If the C<sample> argument is passed to L</new>, there will also be a
132 structure which holds data about unmapped subfields encountered in
133 mapped tags which are also in the declared sample set. This
134 information is collected over the life of the object and is not reset
135 for every record processed (as the current record data necessarily
139 sub_code => { value => VALUE,
148 For each mapped tag, for each unmapped subfield, there is a hash of
149 data about that subfield containing
151 * value - A sample of the subfield text
152 * count - Total number of times the subfield was seen
153 * rcnt - The number of records the subfield was seen in
157 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
161 Please report any bugs or feature requests to the above email address.
165 You can find documentation for this module with the perldoc command.
167 perldoc Equinox::Migration::MARCXMLSampler
170 =head1 COPYRIGHT & LICENSE
172 Copyright 2009 Equinox, all rights reserved.
174 This program is free software; you can redistribute it and/or modify it
175 under the same terms as Perl itself.
180 1; # End of Equinox::Migration::MARCXMLSampler