1 package Equinox::Migration::MARCXMLSampler;
7 use Equinox::Migration::SimpleTagList 1.001;
12 Equinox::Migration::MARCXMLSampler
20 our $VERSION = '1.003';
29 Produce a list of all fields in a MARCXML file which have a C<tag>
30 attribute, and count how many times each occurs:
32 my $s = E::M::MARCXMLSampler->new( marcfile => "foo.marc.xml" );
35 Also deeply introspect certain tags, producing lists of all subfields,
36 and counts of how many times each subfield occurs I<in toto> and how
37 many records each subfield appears in:
39 my $s = E::M::MARCXMLSampler->new( marcfile => "foo.marc.xml",
40 mapfile => "foo.map" );
43 my $s = E::M::MARCXMLSampler->new( marcfile => "foo.marc.xml",
44 mapstring => "852 999" );
53 Takes one required argument, C<marcfile>, which points to the MARCXML
56 Has two mutually-exclusive optional arguments, C<mapfile> and
57 C<mapstring>. The former should point to a file which will be used as
58 an L<Equinox::Migration::SimpleTagList> map; the latter should have as
59 its value a text string which will be used in the same way (handy for
60 when you only want deep introspection on a handful of tags).
65 my ($class, %args) = @_;
67 $dstore = { rcnt => 0, # record counter
68 tcnt => 0, # tag counter
69 scnt => {}, # subfield/tag counters
70 samp => {}, # data samples
71 tags => {}, # all found tags
74 my $self = bless { data => $dstore,
78 die "Argument 'marcfile' must be specified\n" unless ($args{marcfile});
79 if (-r $args{marcfile}) {
80 $xmltwig = XML::Twig->new( twig_handlers => { record => \&parse_record } );
81 $self->{conf}{marc} = $args{marcfile};
83 die "Can't open marc file: $!\n";
86 # if we have a sample arg, create the sample map
87 die "Can't use a mapfile and mapstring\n"
88 if ($args{mapfile} and $args{mapstring});
89 $taglist = Equinox::Migration::SimpleTagList->new(file => $args{mapfile})
91 $taglist = Equinox::Migration::SimpleTagList->new(str => $args{mapstring})
92 if ($args{mapstring});
94 # do the xml processing
95 $xmltwig->parsefile( $self->{conf}{marc} );
97 # hand ourselves back for
104 XML::Twig handler for record elements; drives data extraction process.
109 my ($twig, $record) = @_;
111 my @fields = $record->children;
113 { process_field($f) }
115 # cleanup memory and increment pointer
123 my $tag = $field->{'att'}->{'tag'};
124 return unless ($tag and $tag > 9);
126 # increment raw tag count
128 $dstore->{tags}{$tag}++;
131 if ($taglist and $taglist->has($tag)) {
132 my @subs = $field->children('subfield');
135 { process_subs($tag, $sub); $i++ }
137 # increment sub length counter
138 $dstore->{scnt}{$tag}{$i}++;
143 my ($tag, $sub) = @_;
144 my $code = $sub->{'att'}->{'code'};
146 # handle unmapped tag/subs
147 my $samp = $dstore->{samp};
148 # set a sample value, total-seen count and tags-seen-in count
149 $samp->{$tag}{$code}{value} = $sub->text unless $samp->{$tag}{$code};
150 $samp->{$tag}{$code}{count}++;
151 $samp->{$tag}{$code}{tcnt}++ unless ( defined $samp->{$tag}{$code}{last} and
152 $samp->{$tag}{$code}{last} == $dstore->{tcnt} );
153 $samp->{$tag}{$code}{last} = $dstore->{tcnt};
159 If the C<mapfile> or C<mapstring> arguments are passed to L</new>, a
160 structure will be constructed which holds data about tags in the map.
163 sub_code => { value => VALUE,
172 For each subfield in each mapped tag, there is a hash of data about
173 that subfield containing:
175 * value - A sample of the subfield text
176 * count - Total number of times the subfield was seen
177 * tcnt - The number of tags the subfield was seen in
181 Shawn Boyette, C<< <sboyette at esilibrary.com> >>
185 Please report any bugs or feature requests to the above email address.
189 You can find documentation for this module with the perldoc command.
191 perldoc Equinox::Migration::MARCXMLSampler
194 =head1 COPYRIGHT & LICENSE
196 Copyright 2009 Equinox, all rights reserved.
198 This program is free software; you can redistribute it and/or modify it
199 under the same terms as Perl itself.
204 1; # End of Equinox::Migration::MARCXMLSampler